path: root/compute/ARMComputeEx/src/core/NEON/kernels
Diffstat (limited to 'compute/ARMComputeEx/src/core/NEON/kernels')
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp  237
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp  653
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp  165
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp  205
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp  118
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp  252
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp  181
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp  280
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp  213
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp  274
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp  224
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEReductionOperationKernelEx.cpp  677
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp  165
13 files changed, 3644 insertions(+), 0 deletions(-)
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp
new file mode 100644
index 000000000..d2f42de53
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <algorithm>
+#include <arm_neon.h>
+#include <map>
+#include <string>
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace arm_compute
+{
+
+template <BinaryLogicalOperation op, typename ScalarType>
+inline ScalarType elementwise_logic_op_scalar(const ScalarType &a, const ScalarType &b)
+{
+ auto res = ScalarType(0);
+
+ switch (op)
+ {
+ case BinaryLogicalOperation::AND:
+ res = a & b;
+ break;
+ case BinaryLogicalOperation::OR:
+ res = a | b;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+ return res;
+}
+
+template <BinaryLogicalOperation op, typename VectorType>
+inline VectorType elementwise_logic_op(const VectorType &a, const VectorType &b)
+{
+ VectorType res = {0, 0, 0, 0};
+
+ switch (op)
+ {
+ case BinaryLogicalOperation::AND:
+ res = wrapper::vand(a, b);
+ break;
+ case BinaryLogicalOperation::OR:
+ res = wrapper::vorr(a, b);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+ return res;
+}
+
+template <BinaryLogicalOperation op>
+inline uint8x16x4_t elementwise_logic_op(const uint8x16x4_t &a, const uint8x16x4_t &b)
+{
+ uint8x16x4_t out = {{
+ elementwise_logic_op<op>(a.val[0], b.val[0]), elementwise_logic_op<op>(a.val[1], b.val[1]),
+ elementwise_logic_op<op>(a.val[2], b.val[2]), elementwise_logic_op<op>(a.val[3], b.val[3]),
+ }};
+ return out;
+}
+
+template <BinaryLogicalOperation op, typename ScalarType, typename VectorType>
+inline VectorType elementwise_logic_op_broadcast(const VectorType &a,
+ const ScalarType &broadcast_value,
+ const bool reorder)
+{
+ VectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag());
+ return elementwise_logic_op<op>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector);
+}
+
+template <BinaryLogicalOperation op, typename ScalarType, typename VectorType>
+inline int elementwise_logic_op_loop(int window_start_x, int window_end_x, int window_step_x,
+ const ScalarType *input1_ptr, const ScalarType *input2_ptr,
+ ScalarType *output_ptr)
+{
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = wrapper::vloadq(input1_ptr + x);
+ const auto b = wrapper::vloadq(input2_ptr + x);
+ wrapper::vstore(output_ptr + x, elementwise_logic_op<op>(a, b));
+ }
+ return x;
+}
+
+template <BinaryLogicalOperation op, typename ScalarType, typename VectorType>
+inline int elementwise_logic_op_broadcast_loop(int window_start_x, int window_end_x,
+ int window_step_x,
+ const ScalarType *non_broadcast_input_ptr,
+ const ScalarType &broadcast_value,
+ ScalarType *output_ptr, const bool reorder)
+{
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = wrapper::vloadq((non_broadcast_input_ptr + x));
+ wrapper::vstore(output_ptr + x,
+ elementwise_logic_op_broadcast<op>(a, broadcast_value, reorder));
+ }
+ return x;
+}
+
+template <BinaryLogicalOperation op, typename ScalarType, typename VectorType>
+void elementwise_logic_op(const ITensor *in1, const ITensor *in2, ITensor *out,
+ const Window &window)
+{
+ elementwise_op(in1, in2, out, window, &elementwise_logic_op_scalar<op, ScalarType>,
+ &elementwise_logic_op_broadcast_loop<op, ScalarType, VectorType>,
+ &elementwise_logic_op_loop<op, ScalarType, VectorType>);
+}
+
+std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)> configure_func(
+ const ITensor *input1, const ITensor *input2, ITensor *output,
+ std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function)
+{
+ std::string function_to_call("op_");
+ function_to_call += string_from_data_type(input1->info()->data_type()) + "_";
+ function_to_call += string_from_data_type(input2->info()->data_type()) + "_";
+ function_to_call += string_from_data_type(output->info()->data_type());
+
+ auto it = map_function.find(function_to_call);
+
+ if (it != map_function.end())
+ {
+ auto func = it->second;
+ return [func](const ITensor *input1, const ITensor *input2, ITensor *output,
+ const Window &window) { func(input1, input2, output, window); };
+ }
+ return nullptr;
+}
+
+template <BinaryLogicalOperation op>
+std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)>
+configure_logic_func(const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+ static std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function = {
+ {"op_U8_U8_U8", &elementwise_logic_op<op, uint8_t, uint8x16_t>},
+ {"op_QASYMM8_QASYMM8_QASYMM8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}};
+
+ return configure_func(input1, input2, output, map_function);
+}
+
+void NEBinaryLogicalOperationKernel::configure(BinaryLogicalOperation op, const ITensor *input1,
+ const ITensor *input2, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info()));
+ configure_common(input1, input2, output);
+ switch (op)
+ {
+ case BinaryLogicalOperation::AND:
+ _function = configure_logic_func<BinaryLogicalOperation::AND>(input1, input2, output);
+ break;
+ case BinaryLogicalOperation::OR:
+ _function = configure_logic_func<BinaryLogicalOperation::OR>(input1, input2, output);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+}
+
+Status NEBinaryLogicalOperationKernel::validate_arguments(const ITensorInfo &input1,
+ const ITensorInfo &input2,
+ const ITensorInfo &output)
+{
+ // Validate in case of configured output
+ if (output.total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8,
+ DataType::QASYMM8);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2);
+
+ const TensorShape out_shape =
+ TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
+ "Inputs are not broadcast compatible");
+
+ // Validate in case of configured output
+ if (output.total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
+ "Wrong shape for output");
+ }
+
+ return Status{};
+}
+
+Status NEBinaryLogicalOperationKernel::validate(BinaryLogicalOperation op,
+ const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output)
+{
+ ARM_COMPUTE_UNUSED(op);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output));
+ return Status{};
+}
+
+} // namespace arm_compute
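
Editor's note: the kernel above dispatches to templated NEON AND/OR paths through a string-keyed function map and only accepts U8/QASYMM8 tensors. The following minimal sketch is not part of the patch; it assumes the stock ACL runtime Tensor and NEScheduler helpers and that the kernel is default-constructible like the upstream element-wise kernels, and the function name is purely illustrative.

#include "arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

// Illustrative only: apply logical AND to two U8 tensors of shape 16x4.
void example_binary_logical_and()
{
  Tensor a, b, out;
  const TensorInfo info(TensorShape(16U, 4U), 1, DataType::U8);
  a.allocator()->init(info);
  b.allocator()->init(info);
  out.allocator()->init(info);
  a.allocator()->allocate();
  b.allocator()->allocate();
  out.allocator()->allocate();
  // ... fill a and b with 0/1 values ...

  NEBinaryLogicalOperationKernel kernel;
  kernel.configure(BinaryLogicalOperation::AND, &a, &b, &out);
  NEScheduler::get().schedule(&kernel, Window::DimY);
}
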
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp
new file mode 100644
index 000000000..7e4fc129b
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp
@@ -0,0 +1,653 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NECastKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+ SubDataType input_subtype)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8,
+ DataType::QASYMM8, DataType::U32,
+ DataType::S32, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(input_subtype == SubDataType::BOOL &&
+ input->data_type() != DataType::U8);
+
+ if (output->tensor_shape().total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8,
+ DataType::QASYMM8, DataType::U32,
+ DataType::S32, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ // Configure kernel window
+ Window win = calculate_max_window(*input, Steps());
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32);
+
+ // NECastKernel doesn't need padding so update_window_and_padding() can be skipped
+ Coordinates coord;
+ coord.set_num_dimensions(output->num_dimensions());
+ output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
+
+ return std::make_tuple(Status{}, win);
+}
+
+typedef struct bool8x16
+{
+ uint8x16_t val;
+} bool8x16_t;
+
+static inline uint8x16_t vreinterpretq_u8_b8(bool8x16_t __a) { return (uint8x16_t)__a.val; }
+
+template <typename ToV, typename FromV> inline ToV vcast(const FromV &v) { return v; }
+template <> inline uint8x16_t vcast(const bool8x16_t &v)
+{
+ const uint8x16_t vu8 = vreinterpretq_u8_b8(v);
+ const uint8x16_t zero_uint8x16 = vdupq_n_u8(0);
+ uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16);
+ return vshrq_n_u8(mask, 7); // true -> 1, false -> 0
+}
+
+template <> inline uint32x4x4_t vcast(const bool8x16_t &v)
+{
+ const uint8x16_t vu8 = vreinterpretq_u8_b8(v);
+ const uint8x16_t zero_uint8x16 = vdupq_n_u8(0);
+ uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16);
+ uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0
+
+ const uint32x4x4_t ret = {{
+ vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb)))),
+ vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb)))),
+ vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb)))),
+ vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb)))),
+ }};
+
+ return ret;
+}
+
+template <> inline int32x4x4_t vcast(const bool8x16_t &v)
+{
+ const uint8x16_t vu8 = vreinterpretq_u8_b8(v);
+ const uint8x16_t zero_uint8x16 = vdupq_n_u8(0);
+ uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16);
+ uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0
+
+ const int32x4x4_t ret = {{
+ vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb))))),
+ vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb))))),
+ vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb))))),
+ vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb))))),
+ }};
+
+ return ret;
+}
+
+template <> inline float32x4x4_t vcast(const bool8x16_t &v)
+{
+ const uint8x16_t vu8 = vreinterpretq_u8_b8(v);
+ const uint8x16_t zero_uint8x16 = vdupq_n_u8(0);
+ uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16);
+ uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0
+
+ const float32x4x4_t ret = {{
+ vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb))))),
+ vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb))))),
+ vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb))))),
+ vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb))))),
+ }};
+
+ return ret;
+}
+
+template <> inline uint32x4x4_t vcast(const uint8x16_t &v)
+{
+ const uint32x4x4_t ret = {{
+ vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v)))),
+ vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v)))),
+ vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v)))),
+ vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v)))),
+ }};
+
+ return ret;
+}
+
+template <> inline int32x4x4_t vcast(const uint8x16_t &v)
+{
+ const int32x4x4_t ret = {{
+ vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v))))),
+ vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v))))),
+ vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v))))),
+ vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v))))),
+ }};
+
+ return ret;
+}
+
+template <> inline float32x4x4_t vcast(const uint8x16_t &v)
+{
+ const float32x4x4_t ret = {{
+ vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v))))),
+ vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v))))),
+ vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v))))),
+ vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v))))),
+ }};
+
+ return ret;
+}
+
+template <> inline uint8x16_t vcast(const int32x4x4_t &v)
+{
+ // Saturate cast
+ return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovun_s32(v.val[0]), vqmovun_s32(v.val[1]))),
+ vqmovn_u16(vcombine_u16(vqmovun_s32(v.val[2]), vqmovun_s32(v.val[3]))));
+}
+
+template <> inline uint32x4x4_t vcast(const int32x4x4_t &v)
+{
+ // Saturate cast
+ const uint32x4x4_t ret = {{
+ vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[0]))),
+ vqmovun_s64(vmovl_s32(vget_high_s32(v.val[0])))),
+ vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[1]))),
+ vqmovun_s64(vmovl_s32(vget_high_s32(v.val[1])))),
+ vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[2]))),
+ vqmovun_s64(vmovl_s32(vget_high_s32(v.val[2])))),
+ vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[3]))),
+ vqmovun_s64(vmovl_s32(vget_high_s32(v.val[3])))),
+ }};
+
+ return ret;
+}
+
+template <> inline float32x4x4_t vcast(const int32x4x4_t &v)
+{
+ const float32x4x4_t ret = {{
+ vcvtq_f32_s32(v.val[0]), vcvtq_f32_s32(v.val[1]), vcvtq_f32_s32(v.val[2]),
+ vcvtq_f32_s32(v.val[3]),
+ }};
+
+ return ret;
+}
+
+template <> inline uint8x16_t vcast(const uint32x4x4_t &v)
+{
+ return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovn_u32(v.val[0]), vqmovn_u32(v.val[1]))),
+ vqmovn_u16(vcombine_u16(vqmovn_u32(v.val[2]), vqmovn_u32(v.val[3]))));
+}
+
+template <> inline int32x4x4_t vcast(const uint32x4x4_t &v)
+{
+ const int32x4x4_t ret = {{
+ vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[0])))),
+ vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[0]))))),
+ vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[1])))),
+ vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[1]))))),
+ vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[2])))),
+ vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[2]))))),
+ vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[3])))),
+ vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[3]))))),
+ }};
+
+ return ret;
+}
+
+template <> inline float32x4x4_t vcast(const uint32x4x4_t &v)
+{
+ const float32x4x4_t ret = {{
+ vcvtq_f32_u32(v.val[0]), vcvtq_f32_u32(v.val[1]), vcvtq_f32_u32(v.val[2]),
+ vcvtq_f32_u32(v.val[3]),
+ }};
+
+ return ret;
+}
+
+template <> inline uint8x16_t vcast(const float32x4x4_t &v)
+{
+ // Saturate cast
+ return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(v.val[0])),
+ vqmovun_s32(vcvtq_s32_f32(v.val[1])))),
+ vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(v.val[2])),
+ vqmovun_s32(vcvtq_s32_f32(v.val[3])))));
+}
+
+template <> inline uint32x4x4_t vcast(const float32x4x4_t &v)
+{
+ const uint32x4x4_t ret = {{
+ vcvtq_u32_f32(v.val[0]), vcvtq_u32_f32(v.val[1]), vcvtq_u32_f32(v.val[2]),
+ vcvtq_u32_f32(v.val[3]),
+ }};
+
+ return ret;
+}
+
+template <> inline int32x4x4_t vcast(const float32x4x4_t &v)
+{
+ const int32x4x4_t ret = {{
+ vcvtq_s32_f32(v.val[0]), vcvtq_s32_f32(v.val[1]), vcvtq_s32_f32(v.val[2]),
+ vcvtq_s32_f32(v.val[3]),
+ }};
+
+ return ret;
+}
+
+template <typename T> struct cast_vector;
+template <> struct cast_vector<bool>
+{
+ using type = bool8x16_t;
+};
+template <> struct cast_vector<uint8_t>
+{
+ using type = uint8x16_t;
+};
+template <> struct cast_vector<uint32_t>
+{
+ using type = uint32x4x4_t;
+};
+template <> struct cast_vector<int32_t>
+{
+ using type = int32x4x4_t;
+};
+template <> struct cast_vector<float>
+{
+ using type = float32x4x4_t;
+};
+
+template <typename T> inline void store_result(T *ptr, const typename cast_vector<T>::type &v)
+{
+ wrapper::vstore(ptr, v.val[0]);
+ wrapper::vstore(ptr + 4, v.val[1]);
+ wrapper::vstore(ptr + 8, v.val[2]);
+ wrapper::vstore(ptr + 12, v.val[3]);
+}
+
+template <> inline void store_result<uint8_t>(uint8_t *ptr, const uint8x16_t &v)
+{
+ wrapper::vstore(ptr, v);
+}
+
+inline bool8x16_t vloadq(const bool *ptr)
+{
+ bool8x16_t ret;
+ ret.val = wrapper::vloadq(reinterpret_cast<const uint8_t *>(ptr));
+ return ret;
+}
+
+template <typename T> inline typename cast_vector<T>::type load_input(const T *ptr)
+{
+ return wrapper::vloadq(ptr);
+}
+
+template <> inline typename cast_vector<bool>::type load_input(const bool *ptr)
+{
+ return vloadq(ptr);
+}
+
+template <> inline typename cast_vector<uint32_t>::type load_input(const uint32_t *ptr)
+{
+ return vld4q_u32(ptr);
+}
+
+template <> inline typename cast_vector<int32_t>::type load_input(const int32_t *ptr)
+{
+ return vld4q_s32(ptr);
+}
+
+template <> inline typename cast_vector<float>::type load_input(const float *ptr)
+{
+ return vld4q_f32(ptr);
+}
+
+template <typename T> inline T get_value(const T *ptr) { return *ptr; }
+
+template <> inline bool get_value(const bool *ptr)
+{
+ bool ret = (*ptr != 0);
+ return ret;
+}
+
+template <typename FromT> void run_cast(const ITensor *input, ITensor *output, const Window &window)
+{
+ const int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ // Collapse window and reset first dimension to handle tail calculations manually
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ // Create iterators
+ Iterator in(input, win_collapsed);
+ Iterator out(output, win_collapsed);
+
+#ifdef __aarch64__
+ constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
+#else //__aarch64__
+ constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO;
+#endif //__aarch64__
+
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &) {
+ const auto in_ptr = reinterpret_cast<const FromT *>(in.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ using from_vector = typename cast_vector<FromT>::type;
+ const from_vector vin = load_input(in_ptr + x);
+
+ switch (output->info()->data_type())
+ {
+ case DataType::U8:
+ {
+ using to_vector = typename cast_vector<uint8_t>::type;
+ const to_vector vout = vcast<to_vector, from_vector>(vin);
+ store_result<uint8_t>(reinterpret_cast<uint8_t *>(out.ptr()) + x, vout);
+ break;
+ }
+ case DataType::QASYMM8:
+ {
+ using to_vector = typename cast_vector<float>::type;
+ const QuantizationInfo &qinfo_out = output->info()->quantization_info();
+ const auto vf = vcast<to_vector, from_vector>(vin);
+ const auto vout = vquantize(vf, qinfo_out);
+ store_result<qasymm8_t>(reinterpret_cast<qasymm8_t *>(out.ptr()) + x, vout);
+ break;
+ }
+ case DataType::U32:
+ {
+ using to_vector = typename cast_vector<uint32_t>::type;
+ const to_vector vout = vcast<to_vector, from_vector>(vin);
+ store_result<uint32_t>(reinterpret_cast<uint32_t *>(out.ptr()) + x, vout);
+ break;
+ }
+ case DataType::S32:
+ {
+ using to_vector = typename cast_vector<int32_t>::type;
+ const to_vector vout = vcast<to_vector, from_vector>(vin);
+ store_result<int32_t>(reinterpret_cast<int32_t *>(out.ptr()) + x, vout);
+ break;
+ }
+ case DataType::F32:
+ {
+ using to_vector = typename cast_vector<float>::type;
+ const to_vector vout = vcast<to_vector, from_vector>(vin);
+ store_result<float>(reinterpret_cast<float *>(out.ptr()) + x, vout);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ }
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ FromT val = get_value(in_ptr + x);
+ switch (output->info()->data_type())
+ {
+ case DataType::U8:
+ {
+ *(reinterpret_cast<uint8_t *>(out.ptr()) + x) = static_cast<uint8_t>(val);
+ break;
+ }
+ case DataType::QASYMM8:
+ {
+ const QuantizationInfo &qinfo_out = output->info()->quantization_info();
+ const auto qval = qinfo_out.quantize(static_cast<float>(val), rounding_policy);
+ *(reinterpret_cast<qasymm8_t *>(out.ptr()) + x) = qval;
+ break;
+ }
+ case DataType::U32:
+ {
+ *(reinterpret_cast<uint32_t *>(out.ptr()) + x) = static_cast<uint32_t>(val);
+ break;
+ }
+ case DataType::S32:
+ {
+ *(reinterpret_cast<int32_t *>(out.ptr()) + x) = static_cast<int32_t>(val);
+ break;
+ }
+ case DataType::F32:
+ {
+ *(reinterpret_cast<float *>(out.ptr()) + x) = static_cast<float>(val);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ }
+ }
+ },
+ in, out);
+}
+
+void run_cast_qasymm8(const ITensor *input, ITensor *output, const Window &window)
+{
+ const int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ // Collapse window and reset first dimension to handle tail calculations manually
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ // Create iterators
+ Iterator in(input, win_collapsed);
+ Iterator out(output, win_collapsed);
+
+#ifdef __aarch64__
+ constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
+#else //__aarch64__
+ constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO;
+#endif //__aarch64__
+ const auto &qinfo_in = input->info()->quantization_info();
+ const auto &qinfo_out = output->info()->quantization_info();
+
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &) {
+ const auto in_ptr = reinterpret_cast<const qasymm8_t *>(in.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ using from_vector = typename cast_vector<float>::type;
+ const auto vf = wrapper::vloadq(in_ptr + x);
+ const auto vin = vdequantize(vf, qinfo_in);
+ switch (output->info()->data_type())
+ {
+ case DataType::U8:
+ {
+ using to_vector = typename cast_vector<uint8_t>::type;
+ const to_vector vout = vcast<to_vector, from_vector>(vin);
+ store_result<uint8_t>(reinterpret_cast<uint8_t *>(out.ptr()) + x, vout);
+ break;
+ }
+ case DataType::QASYMM8:
+ {
+ using to_vector = typename cast_vector<float>::type;
+ const auto vf = vcast<to_vector, from_vector>(vin);
+ const auto vout = vquantize(vf, qinfo_out);
+ store_result<qasymm8_t>(reinterpret_cast<qasymm8_t *>(out.ptr()) + x, vout);
+ break;
+ }
+ case DataType::U32:
+ {
+ using to_vector = typename cast_vector<uint32_t>::type;
+ const to_vector vout = vcast<to_vector, from_vector>(vin);
+ store_result<uint32_t>(reinterpret_cast<uint32_t *>(out.ptr()) + x, vout);
+ break;
+ }
+ case DataType::S32:
+ {
+ using to_vector = typename cast_vector<int32_t>::type;
+ const to_vector vout = vcast<to_vector, from_vector>(vin);
+ store_result<int32_t>(reinterpret_cast<int32_t *>(out.ptr()) + x, vout);
+ break;
+ }
+ case DataType::F32:
+ {
+ using to_vector = typename cast_vector<float>::type;
+ const to_vector vout = vcast<to_vector, from_vector>(vin);
+ store_result<float>(reinterpret_cast<float *>(out.ptr()) + x, vout);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ }
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ qasymm8_t qval_in = *(in_ptr + x);
+ const auto val = qinfo_in.dequantize(qval_in);
+
+ switch (output->info()->data_type())
+ {
+ case DataType::U8:
+ {
+ *(reinterpret_cast<uint8_t *>(out.ptr()) + x) = static_cast<uint8_t>(val);
+ break;
+ }
+ case DataType::QASYMM8:
+ {
+ const auto qval_out = qinfo_out.quantize(val, rounding_policy);
+ *(reinterpret_cast<qasymm8_t *>(out.ptr()) + x) = qval_out;
+ break;
+ }
+ case DataType::U32:
+ {
+ *(reinterpret_cast<uint32_t *>(out.ptr()) + x) = static_cast<uint32_t>(val);
+ break;
+ }
+ case DataType::S32:
+ {
+ *(reinterpret_cast<int32_t *>(out.ptr()) + x) = static_cast<int32_t>(val);
+ break;
+ }
+ case DataType::F32:
+ {
+ *(reinterpret_cast<float *>(out.ptr()) + x) = static_cast<float>(val);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ }
+ }
+ },
+ in, out);
+}
+} // namespace
+
+NECastKernel::NECastKernel() : _input(nullptr), _output(nullptr), _input_subtype(SubDataType::NONE)
+{
+}
+
+void NECastKernel::configure(const ITensor *input, ITensor *output, SubDataType input_subtype)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), input_subtype));
+
+ _input = input;
+ _output = output;
+ _input_subtype = input_subtype;
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info());
+
+ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+
+ INEKernel::configure(std::get<1>(win_config));
+}
+
+Status NECastKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+ SubDataType input_subtype)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, input_subtype));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
+ return Status{};
+}
+
+void NECastKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ switch (_input->info()->data_type())
+ {
+ case DataType::U8:
+ if (_input_subtype == SubDataType::BOOL)
+ {
+ run_cast<bool>(_input, _output, window);
+ }
+ else
+ {
+ run_cast<uint8_t>(_input, _output, window);
+ }
+ break;
+ case DataType::QASYMM8:
+ run_cast_qasymm8(_input, _output, window);
+ break;
+ case DataType::U32:
+ run_cast<uint32_t>(_input, _output, window);
+ break;
+ case DataType::S32:
+ run_cast<int32_t>(_input, _output, window);
+ break;
+ case DataType::F32:
+ run_cast<float>(_input, _output, window);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ }
+}
+} // namespace arm_compute
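
Editor's note: a minimal sketch of driving the cast kernel directly, assuming the stock ACL runtime Tensor and NEScheduler helpers; the BOOL subtype is only accepted together with a U8 input, as enforced by validate_arguments() above. Not part of the patch; names are illustrative.

#include "arm_compute/core/NEON/kernels/NECastKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

// Illustrative only: cast a U8 tensor holding boolean values to S32.
void example_cast_bool_to_s32()
{
  Tensor in, out;
  in.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::U8));
  out.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::S32));
  in.allocator()->allocate();
  out.allocator()->allocate();
  // ... fill in with 0/1 values ...

  NECastKernel cast;
  cast.configure(&in, &out, SubDataType::BOOL);
  NEScheduler::get().schedule(&cast, Window::DimY);
}
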
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp
new file mode 100644
index 000000000..8a2223c26
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+#include <arm_neon.h>
+#include <cstdint>
+
+using namespace arm_compute::misc::shape_calculator;
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 2);
+
+ const DataLayout data_layout = input->data_layout();
+ const int idx_channel =
+ get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] % (block_shape * block_shape) !=
+ 0);
+ // Validate output if initialized
+ if (output->total_size() != 0)
+ {
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height =
+ get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] !=
+ (block_shape * input->tensor_shape()[idx_width]));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] !=
+ (block_shape * input->tensor_shape()[idx_height]));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+} // namespace
+
+NEDepthToSpaceLayerKernelEx::NEDepthToSpaceLayerKernelEx()
+ : _input(nullptr), _output(nullptr), _block_shape()
+{
+}
+
+void NEDepthToSpaceLayerKernelEx::configure(const ITensor *input, ITensor *output,
+ int32_t block_shape)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ TensorShape output_shape = compute_depth_to_space_shape_ex(input->info(), block_shape);
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape));
+
+ _input = input;
+ _output = output;
+ _block_shape = block_shape;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps());
+ ICPPKernel::configure(win);
+}
+
+Status NEDepthToSpaceLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+ int32_t block_shape)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, block_shape));
+ return Status{};
+}
+
+void NEDepthToSpaceLayerKernelEx::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
+
+ const int idx_channel =
+ get_data_layout_dimension_index(_input->info()->data_layout(), DataLayoutDimension::CHANNEL);
+ const int depth_size = _input->info()->dimension(idx_channel);
+ const int r = (depth_size / (_block_shape * _block_shape));
+ const int element_size = _input->info()->element_size();
+
+ Window slice_out = window.first_slice_window_3D();
+
+ // The slice_out slice does not move
+ slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ // Main loop for NCHW and NHWC
+ if (_input->info()->data_layout() == DataLayout::NCHW)
+ {
+ Window slice_in = window.first_slice_window_2D();
+ do
+ {
+ Iterator in(_input, slice_in);
+ execute_window_loop(slice_in,
+ [&](const Coordinates &id) {
+ const int x = id.x();
+ const int y = id.y();
+
+ const int z = id.z() % r;
+ const int out_x = x * _block_shape + (id.z() / r) % _block_shape;
+ const int out_y = y * _block_shape + (id.z() / r) / _block_shape;
+ Coordinates output_coords{out_x, out_y, z, id[3]};
+ memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
+ },
+ in);
+ } while (window.slide_window_slice_2D(slice_in));
+ }
+ else
+ {
+ Window slice_in = window.first_slice_window_3D();
+ do
+ {
+ Iterator in(_input, slice_in);
+ execute_window_loop(slice_in,
+ [&](const Coordinates &id) {
+ const int x = id.y();
+ const int y = id.z();
+
+ const int z = id.x() % r;
+ const int out_x = x * _block_shape + (id.x() / r) % _block_shape;
+ const int out_y = y * _block_shape + (id.x() / r) / _block_shape;
+ Coordinates output_coords{z, out_x, out_y, id[3]};
+ memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
+ },
+ in);
+ } while (window.slide_window_slice_3D(slice_in));
+ }
+}
+} // namespace arm_compute
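
Editor's note: a minimal sketch, assuming the stock ACL runtime Tensor and NEScheduler helpers. The channel count must be divisible by block_shape squared, and configure() auto-initializes the output shape via compute_depth_to_space_shape_ex(), so the output can be allocated after configuration. Not part of the patch; names are illustrative.

#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

// Illustrative only: rearrange a 2x2x8 NCHW tensor into 4x4x2 with block_shape = 2.
void example_depth_to_space()
{
  Tensor in, out;
  in.allocator()->init(TensorInfo(TensorShape(2U, 2U, 8U), 1, DataType::F32));
  in.allocator()->allocate();

  NEDepthToSpaceLayerKernelEx kernel;
  kernel.configure(&in, &out, 2); // output info is auto-initialized to 4x4x2
  out.allocator()->allocate();
  NEScheduler::get().schedule(&kernel, Window::DimY);
}
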
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp
new file mode 100644
index 000000000..cebd614df
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <algorithm>
+#include <arm_neon.h>
+#include <cstdint>
+#include <map>
+#include <string>
+
+namespace arm_compute
+{
+class Coordinates;
+
+namespace
+{
+template <ElementWiseUnaryEx op, typename ScalarType>
+inline ScalarType elementwise_op_scalar(const ScalarType &a)
+{
+ switch (op)
+ {
+ case ElementWiseUnaryEx::NEG:
+ return -a;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+}
+
+template <ElementWiseUnaryEx op, typename VectorType>
+inline VectorType elementwise_op(const VectorType &a)
+{
+ switch (op)
+ {
+ case ElementWiseUnaryEx::NEG:
+ return wrapper::vneg(a);
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+}
+
+template <ElementWiseUnaryEx op, typename ScalarType>
+void elementwise_op(const ITensor *in, ITensor *out, const Window &window)
+{
+ const int window_step_x = 16 / sizeof(ScalarType);
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(in, win);
+ Iterator output(out, win);
+
+ execute_window_loop(win,
+ [&](const Coordinates &) {
+ auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+ const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
+
+ int x = window_start_x;
+ for (; x <= window_end_x - window_step_x; x += window_step_x)
+ {
+ wrapper::vstore(output_ptr + x,
+ elementwise_op<op>(wrapper::vloadq(input_ptr + x)));
+ }
+ for (; x < window_end_x; ++x)
+ {
+ *(output_ptr + x) = elementwise_op_scalar<op>(*(input_ptr + x));
+ }
+ },
+ input, output);
+}
+
+template <ElementWiseUnaryEx op>
+std::function<void(const ITensor *input, ITensor *output, const Window &window)>
+configure_func(const ITensor *input, ITensor *output)
+{
+ std::string function_to_call("op_");
+ function_to_call += string_from_data_type(input->info()->data_type()) + "_";
+ function_to_call += string_from_data_type(output->info()->data_type());
+
+ static std::map<std::string, NEElementwiseUnaryKernelEx::ElementwiseUnaryFunction *>
+ map_function = {
+ {"op_F32_F32", &elementwise_op<op, float>}, {"op_S32_S32", &elementwise_op<op, int32_t>},
+ };
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ map_function["op_F16_F16"] = &elementwise_op<op, float16_t>;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+ auto it = map_function.find(function_to_call);
+
+ if (it != map_function.end())
+ {
+ auto func = it->second;
+ return [func](const ITensor *input, ITensor *output, const Window &window) {
+ func(input, output, window);
+ };
+ }
+ return nullptr;
+}
+} // namespace
+
+NEElementwiseUnaryKernelEx::NEElementwiseUnaryKernelEx()
+ : _function(nullptr), _input(nullptr), _output(nullptr)
+{
+}
+
+void NEElementwiseUnaryKernelEx::configure(ElementWiseUnaryEx op, const ITensor *input,
+ ITensor *output)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input->info(), *output->info()));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ // Configure kernel window
+ const std::pair<TensorShape, ValidRegion> broadcast_pair =
+ ITensorInfo::broadcast_shape_and_valid_region(*input->info());
+ const TensorShape &out_shape = broadcast_pair.first;
+ const ValidRegion &valid_region = broadcast_pair.second;
+
+ // Auto initialize output if not initialized
+ auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type());
+
+ Window win = calculate_max_window(valid_region);
+
+ _input = input;
+ _output = output;
+
+ INEKernel::configure(win);
+
+ switch (op)
+ {
+ case ElementWiseUnaryEx::NEG:
+ _function = configure_func<ElementWiseUnaryEx::NEG>(input, output);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+}
+
+Status NEElementwiseUnaryKernelEx::validate_arguments(const ITensorInfo &input,
+ const ITensorInfo &output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::F16, DataType::F32,
+ DataType::S32);
+
+ // Validate in case of configured output
+ if (output.total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output);
+ }
+
+ return Status{};
+}
+
+Status NEElementwiseUnaryKernelEx::validate(ElementWiseUnaryEx op, const ITensorInfo *input,
+ const ITensorInfo *output)
+{
+ ARM_COMPUTE_UNUSED(op);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *output));
+ return Status{};
+}
+
+void NEElementwiseUnaryKernelEx::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_function == nullptr);
+ _function(_input, _output, window);
+}
+} // namespace arm_compute
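
Editor's note: a minimal sketch of the NEG path, assuming the stock ACL runtime Tensor and NEScheduler helpers; the F16 variant is only registered when the library is built with __ARM_FEATURE_FP16_VECTOR_ARITHMETIC, as in configure_func() above. Not part of the patch; names are illustrative.

#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

// Illustrative only: negate an S32 tensor element-wise.
void example_neg()
{
  Tensor in, out;
  in.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::S32));
  out.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::S32));
  in.allocator()->allocate();
  out.allocator()->allocate();

  NEElementwiseUnaryKernelEx neg;
  neg.configure(ElementWiseUnaryEx::NEG, &in, &out);
  NEScheduler::get().schedule(&neg, Window::DimY);
}
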
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp
new file mode 100644
index 000000000..5401afea0
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+NEEmbeddingLookupKernel::NEEmbeddingLookupKernel()
+ : _input(nullptr), _lookups(nullptr), _output(nullptr)
+{
+}
+
+void NEEmbeddingLookupKernel::configure(const ITensor *input, ITensor *output,
+ const ITensor *lookups)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info()));
+
+ _input = input;
+ _output = output;
+ _lookups = lookups;
+
+ // Auto initialize output if not initialized
+ auto out_shape = input->info()->tensor_shape();
+ out_shape.set(out_shape.num_dimensions() - 1, lookups->info()->num_dimensions());
+ auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type(),
+ input->info()->quantization_info());
+
+ INEKernel::configure(calculate_max_window(*output->info()));
+}
+
+Status NEEmbeddingLookupKernel::validate(const arm_compute::ITensorInfo *input,
+ const arm_compute::ITensorInfo *output,
+ const arm_compute::ITensorInfo *lookups)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
+
+ ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4);
+ ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1);
+
+ // Validate in case of configured output
+ if (output->total_size() > 0)
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON(input->num_dimensions() != output->num_dimensions());
+ ARM_COMPUTE_ERROR_ON(output->dimension(output->num_dimensions() - 1) != lookups->dimension(0));
+ for (size_t i = 0; i < output->num_dimensions() - 1; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON(input->dimension(i) != output->dimension(i));
+ }
+ }
+
+ return Status{};
+}
+
+void NEEmbeddingLookupKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ const size_t lookup_dim = _output->info()->num_dimensions() - 1;
+
+ Window output_window{window};
+ output_window.set(Window::DimX,
+ Window::Dimension(output_window.x().start(), output_window.x().end(),
+ _input->info()->dimension(0)));
+
+ Window out_slice = output_window.first_slice_window_4D();
+ do
+ {
+ Iterator output_it(_output, out_slice);
+
+ execute_window_loop(out_slice,
+ [&](const Coordinates &id) {
+ const int32_t lookup = *reinterpret_cast<int32_t *>(
+ _lookups->ptr_to_element(Coordinates{id[lookup_dim]}));
+ Coordinates input_id{id};
+ input_id.set(lookup_dim, lookup);
+ memcpy(output_it.ptr(), _input->ptr_to_element(input_id),
+ _output->info()->dimension(0) * _output->info()->element_size());
+ },
+ output_it);
+
+ } while (window.slide_window_slice_4D(out_slice));
+}
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp
new file mode 100644
index 000000000..ce2413dc1
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+
+namespace arm_compute
+{
+namespace
+{
+/** Validate the indices
+ *
+ * Validate that indices are not negative
+ *
+ * @param[in] indices Indices tensor info.
+ */
+template <typename U> void validate_indices(const ITensor *indices)
+{
+ for (size_t i = 0; i < indices->info()->tensor_shape()[0]; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON(*(reinterpret_cast<U *>(indices->ptr_to_element(Coordinates(i)))) < 0);
+ }
+}
+
+} // namespace
+
+NEGatherKernelEx::NEGatherKernelEx() : _input{}, _indices{}, _axis{}, _output{}, _func{} {}
+
+template <typename U>
+inline void NEGatherKernelEx::gather_0_axis(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+
+ // Validate that the indices are not negative
+ validate_indices<U>(_indices);
+
+ Iterator output_it(_output, window);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id) {
+ Coordinates gather_id(id);
+ gather_id.collapse(_indices->info()->num_dimensions(), 0);
+
+ U new_index;
+ switch (_indices->info()->num_dimensions())
+ {
+ case 1:
+ new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0]))));
+ break;
+ case 2:
+ new_index =
+ *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1]))));
+ break;
+ case 3:
+ new_index = *(
+ reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1], id[2]))));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Wrong num of dimensions");
+ break;
+ }
+
+ gather_id.set(0, new_index);
+
+ std::copy_n(_input->ptr_to_element(gather_id), _output->info()->element_size(),
+ output_it.ptr());
+ },
+ output_it);
+}
+
+template <typename U>
+void NEGatherKernelEx::gather_n_axis(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+
+ // Validate that the indices are not negative
+ validate_indices<U>(_indices);
+
+ Window output_window{window};
+ output_window.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator output_it(_output, output_window);
+ execute_window_loop(
+ output_window,
+ [&](const Coordinates &id) {
+ Coordinates gather_id(id);
+ gather_id.collapse(_indices->info()->num_dimensions(), _axis);
+
+ U new_index;
+ switch (_indices->info()->num_dimensions())
+ {
+ case 1:
+ new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[_axis]))));
+ break;
+ case 2:
+ new_index = *(reinterpret_cast<U *>(
+ _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1]))));
+ break;
+ case 3:
+ new_index = *(reinterpret_cast<U *>(
+ _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1], id[_axis + 2]))));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Wrong num of dimensions");
+ break;
+ }
+
+ gather_id.set(_axis, new_index);
+
+ std::copy_n(_input->ptr_to_element(gather_id),
+ _input->info()->dimension(0) * _output->info()->element_size(),
+ output_it.ptr());
+ },
+ output_it);
+}
+
+void NEGatherKernelEx::configure(const ITensor *input, const ITensor *indices, ITensor *output,
+ int axis)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
+ ARM_COMPUTE_ERROR_ON(indices->info()->num_dimensions() > 3);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+
+ _input = input;
+ _indices = indices;
+ _output = output;
+ _axis = axis;
+
+ if (_axis < 0)
+ {
+ _axis += input->info()->num_dimensions();
+ }
+ ARM_COMPUTE_ERROR_ON(0 > _axis || _axis >= static_cast<int32_t>(input->info()->num_dimensions()));
+
+ if (0 == _axis)
+ {
+ switch (_indices->info()->data_type())
+ {
+ case DataType::U32:
+ _func = &NEGatherKernelEx::gather_0_axis<uint32_t>;
+ break;
+ case DataType::S32:
+ _func = &NEGatherKernelEx::gather_0_axis<int32_t>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ }
+ else
+ {
+ switch (_indices->info()->data_type())
+ {
+ case DataType::U32:
+ _func = &NEGatherKernelEx::gather_n_axis<uint32_t>;
+ break;
+ case DataType::S32:
+ _func = &NEGatherKernelEx::gather_n_axis<int32_t>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ }
+ // Output auto initialization if not yet initialized
+ TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex(
+ input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis);
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
+
+ // Create window
+ Window win = calculate_max_window(*output->info(), Steps());
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+Status NEGatherKernelEx::validate(const ITensorInfo *input, const ITensorInfo *indices,
+ const ITensorInfo *output, int axis)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 3);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+ ARM_COMPUTE_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > 4);
+
+ if (axis < 0)
+ {
+ axis += input->num_dimensions();
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON(0 > axis || axis >= static_cast<int32_t>(input->num_dimensions()));
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+
+ if (output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+ TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex(
+ input->tensor_shape(), indices->tensor_shape(), axis);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32);
+
+ return Status{};
+}
+
+void NEGatherKernelEx::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (this->*_func)(window, info);
+}
+
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp
new file mode 100644
index 000000000..391337bfb
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <unordered_map>
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr size_t NOT_HIT = 0xFFFFFFFF;
+} // namespace
+
+NEHashtableLookupKernel::NEHashtableLookupKernel()
+ : _lookups(nullptr), _keys(nullptr), _input(nullptr), _output(nullptr), _hits{nullptr}
+{
+}
+
+void NEHashtableLookupKernel::configure(const ITensor *lookups, const ITensor *keys,
+ const ITensor *input, ITensor *output, ITensor *hits)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits);
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info()));
+
+ _lookups = lookups;
+ _keys = keys;
+ _input = input;
+ _output = output;
+ _hits = hits;
+
+ // Auto initialize output if not initialized
+ auto out_shape{input->info()->tensor_shape()};
+  out_shape.set(out_shape.num_dimensions() - 1, lookups->info()->dimension(0), false);
+ auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type(),
+ input->info()->quantization_info());
+
+ // Auto initialize hits if not initialized
+ auto_init_if_empty(*hits->info(), lookups->info()->tensor_shape(), 1, DataType::U8);
+
+ INEKernel::configure(calculate_max_window(*output->info()));
+}
+
+Status NEHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITensorInfo *keys,
+ const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *hits)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32);
+
+  ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 || input->num_dimensions() > 4);
+ ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1);
+ ARM_COMPUTE_ERROR_ON(keys->num_dimensions() > 1);
+ ARM_COMPUTE_ERROR_ON(keys->dimension(0) != input->dimension(input->num_dimensions() - 1));
+
+ // Validate in case of configured output
+ if (output->total_size() > 0)
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON(input->num_dimensions() != output->num_dimensions());
+ ARM_COMPUTE_ERROR_ON(output->dimension(output->num_dimensions() - 1) != lookups->dimension(0));
+ for (size_t i = 0; i < output->num_dimensions() - 1; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON(input->dimension(i) != output->dimension(i));
+ }
+ }
+
+ // Validate in case of configured hits
+ if (hits->total_size() > 0)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8);
+ ARM_COMPUTE_ERROR_ON(hits->dimension(0) != output->dimension(output->num_dimensions() - 1));
+ ARM_COMPUTE_ERROR_ON(hits->dimension(0) != lookups->dimension(0));
+ ARM_COMPUTE_ERROR_ON(hits->num_dimensions() > 1);
+ }
+
+ return Status{};
+}
+
+void NEHashtableLookupKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ const size_t lookup_dim = _output->info()->num_dimensions() - 1;
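+  // Misses are padded with the QASYMM8 zero-point offset, or 0 for non-quantized outputs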
+ const int const_0 = _output->info()->data_type() == DataType::QASYMM8
+ ? _output->info()->quantization_info().offset
+ : 0;
+
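+  // Build a map from key value to its row index in the input so each lookup below is O(1)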
+ std::unordered_map<int32_t, size_t> key_index_map;
+ for (size_t n = 0; n < _keys->info()->dimension(0); ++n)
+ {
+ const int32_t key = *reinterpret_cast<int32_t *>(_keys->ptr_to_element({n}));
+ key_index_map[key] = n;
+ }
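+  // Resolve every lookup key to a row index (NOT_HIT when the key is absent) and record hits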
+ std::vector<size_t> lookup_indices;
+ for (size_t k = 0; k < _lookups->info()->dimension(0); ++k)
+ {
+ const int32_t key = *reinterpret_cast<int32_t *>(_lookups->ptr_to_element({k}));
+ const auto it = key_index_map.find(key);
+ if (it == key_index_map.end())
+ {
+ lookup_indices.emplace_back(NOT_HIT);
+ *_hits->ptr_to_element({k}) = 0;
+ }
+ else
+ {
+#if defined(ARM_COMPUTE_DEBUG_ENABLED)
+ if (it->second >= _keys->info()->dimension(0))
+ ARM_COMPUTE_ERROR("HashTable Lookup: Index out of bounds.");
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED)
+ lookup_indices.emplace_back(it->second);
+ *_hits->ptr_to_element({k}) = 1;
+ }
+ }
+
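+  // Advance the output window one full row (dimension 0) per iteration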
+ Window output_window{window};
+ output_window.set(Window::DimX,
+ Window::Dimension(output_window.x().start(), output_window.x().end(),
+ _input->info()->dimension(0)));
+
+ Window out_slice = output_window.first_slice_window_4D();
+ do
+ {
+ Iterator output_it(_output, out_slice);
+
+ execute_window_loop(out_slice,
+ [&](const Coordinates &id) {
+ const auto lookup = lookup_indices.at(id[lookup_dim]);
+ if (lookup == NOT_HIT)
+ {
+ memset(output_it.ptr(), const_0,
+ _output->info()->dimension(0) * _output->info()->element_size());
+ }
+ else
+ {
+ Coordinates input_id{id};
+ input_id.set(lookup_dim, lookup);
+ memcpy(output_it.ptr(), _input->ptr_to_element(input_id),
+ _output->info()->dimension(0) * _output->info()->element_size());
+ }
+
+ },
+ output_it);
+
+ } while (window.slide_window_slice_4D(out_slice));
+}
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp
new file mode 100644
index 000000000..1ea77fb5c
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp
@@ -0,0 +1,280 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace
+{
+template <typename T>
+void instance_normalization_nchw(ITensor *input, ITensor *output, ITensor *gamma, ITensor *beta,
+ float epsilon, const Window &window)
+{
+ /** NEON vector tag type. */
+ using ExactTagType =
+ typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+
+ // Clear X/Y dimensions on execution window as we handle the planes manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ constexpr int window_step_x = 16 / sizeof(T);
+ const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1);
+ const auto channel_idx =
+ get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
+
+ Iterator input_it(input, win);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id) {
+ Window win_plane = window;
+ win_plane.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1));
+ win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1));
+
+ Iterator input_plane_it(input, win_plane);
+ Iterator output_plane_it(output, win_plane);
+
+ auto sum_h_w = static_cast<T>(0.f);
+ auto sum_squares_h_w = static_cast<T>(0.f);
+
+ execute_window_loop(
+ win_plane,
+ [&](const Coordinates &) {
+ const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr());
+
+ auto vec_sum_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+ auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+
+ // Compute S elements per iteration
+ int x = window.x().start();
+ for (; x <= (window.x().end() - window_step_x); x += window_step_x)
+ {
+ auto vec_input_val = wrapper::vloadq(input_ptr + x);
+ vec_sum_h_w = wrapper::vadd(vec_sum_h_w, vec_input_val);
+ vec_sum_squares_h_w =
+ wrapper::vadd(vec_sum_squares_h_w, wrapper::vmul(vec_input_val, vec_input_val));
+ }
+
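+          // Reduce the vector accumulators to scalars with pairwise adds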
+ auto vec2_sum_h_w =
+ wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w));
+ auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w),
+ wrapper::vgetlow(vec_sum_squares_h_w));
+ for (int i = 0; i < window_step_x / 4; ++i)
+ {
+ vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w);
+ vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w);
+ }
+ sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0);
+ sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0);
+
+ // Compute left-over elements
+ for (; x < window.x().end(); ++x)
+ {
+ const auto value = *(input_ptr + x);
+ sum_h_w += value;
+ sum_squares_h_w += value * value;
+ }
+ },
+ input_plane_it, output_plane_it);
+
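+        // Mean and (biased) variance of the current H x W plane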
+ const auto mean_h_w = sum_h_w / elements_plane;
+ const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w;
+
+ auto gamma_val = 1.0f;
+ if (gamma != nullptr)
+ {
+ gamma_val = *reinterpret_cast<T *>(gamma->ptr_to_element({id[channel_idx]}));
+ }
+ const auto multip_h_w = gamma_val / std::sqrt(var_h_w + epsilon);
+ const auto vec_mean_h_w = wrapper::vdup_n(static_cast<T>(mean_h_w), ExactTagType{});
+ const auto vec_multip_h_w = wrapper::vdup_n(static_cast<T>(multip_h_w), ExactTagType{});
+ auto beta_val = 0.0f;
+ if (beta != nullptr)
+ {
+ beta_val = *reinterpret_cast<T *>(beta->ptr_to_element({id[channel_idx]}));
+ }
+ const auto vec_beta = wrapper::vdup_n(static_cast<T>(beta_val), ExactTagType{});
+
+ execute_window_loop(
+ win_plane,
+ [&](const Coordinates &) {
+ auto input_ptr = reinterpret_cast<T *>(input_plane_it.ptr());
+ auto output_ptr = reinterpret_cast<T *>(output_plane_it.ptr());
+
+ // Compute S elements per iteration
+ int x = window.x().start();
+ auto vec_val = wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{});
+ for (; x <= (window.x().end() - window_step_x); x += window_step_x)
+ {
+ vec_val = wrapper::vloadq(input_ptr + x);
+ vec_val = wrapper::vadd(
+ wrapper::vmul(wrapper::vsub(vec_val, vec_mean_h_w), vec_multip_h_w), vec_beta);
+ wrapper::vstore(output_ptr + x, vec_val);
+ }
+
+ // Compute left-over elements
+ for (; x < window.x().end(); ++x)
+ {
+ *(output_ptr + x) = ((*(input_ptr + x)) - mean_h_w) * multip_h_w + beta_val;
+ }
+ },
+ input_plane_it, output_plane_it);
+ },
+ input_it);
+}
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *gamma, const ITensorInfo *beta, float epsilon)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(epsilon == 0.f, "Epsilon must be different than 0");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC,
+ "NHWC data layout is not supported by the kernel directly");
+
+ if (output != nullptr && output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(),
+ "Input and output have different number of channels");
+ }
+
+ if (gamma != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(get_data_layout_dimension_index(
+ input->data_layout(), DataLayoutDimension::CHANNEL)) !=
+ gamma->dimension(0),
+ "Gamma's size must be the same as size of input's channel");
+ }
+
+ if (beta != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(get_data_layout_dimension_index(
+ input->data_layout(), DataLayoutDimension::CHANNEL)) !=
+ beta->dimension(0),
+ "Beta's size must be the same as size of input's channel");
+ }
+
+ return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ // We handle the planes manually
+ Window win = calculate_max_window(*input, Steps(1));
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type());
+
+ // NEInstanceNormalizationLayerKernelEx doesn't need padding so update_window_and_padding() can be
+ // skipped
+ Coordinates coord;
+ coord.set_num_dimensions(output->num_dimensions());
+ output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
+ return std::make_pair(Status{}, win);
+}
+} // namespace
+
+NEInstanceNormalizationLayerKernelEx::NEInstanceNormalizationLayerKernelEx()
+ : _func(nullptr), _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr),
+ _epsilon(1e-12)
+{
+}
+
+void NEInstanceNormalizationLayerKernelEx::configure(ITensor *input, ITensor *output,
+ ITensor *gamma, ITensor *beta, float epsilon)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+ _input = input;
+ _output = output == nullptr ? input : output;
+ _gamma = gamma;
+ _beta = beta;
+ _epsilon = epsilon;
+
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(_input->info(), _output->info(),
+                                                (gamma != nullptr) ? gamma->info() : nullptr,
+                                                (beta != nullptr) ? beta->info() : nullptr, epsilon));
+
+ if (_input->info()->data_type() == DataType::F32)
+ {
+ _func = &instance_normalization_nchw<float>;
+ }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ else if (_input->info()->data_type() == DataType::F16)
+ {
+ _func = &instance_normalization_nchw<float16_t>;
+ }
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported data type");
+ }
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(_input->info(), _output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+
+ INEKernel::configure(std::get<1>(win_config));
+}
+
+Status NEInstanceNormalizationLayerKernelEx::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *gamma,
+ const ITensorInfo *beta, float epsilon)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, gamma, beta, epsilon));
+ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(
+ input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get()))));
+ return Status{};
+}
+
+void NEInstanceNormalizationLayerKernelEx::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ (*_func)(_input, _output, _gamma, _beta, _epsilon, window);
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp
new file mode 100644
index 000000000..de218d489
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *scale_factor,
+ const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scale_factor, 1, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->dimension(0) != input->dimension(1));
+
+ // Checks performed when output is configured
+ if ((output->total_size() != 0))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ }
+
+ return Status{};
+}
+
+inline int32x4x4_t load_value(const int32_t *input_ptr)
+{
+ return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4),
+ wrapper::vloadq(input_ptr + 8), wrapper::vloadq(input_ptr + 12)};
+}
+
+inline float32x4x4_t load_value(const float *input_ptr)
+{
+ return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4),
+ wrapper::vloadq(input_ptr + 8), wrapper::vloadq(input_ptr + 12)};
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+inline const float32x4x4_t load_value(const float16_t *input_ptr)
+{
+ return {vcvt_f32_f16(wrapper::vload(input_ptr)), vcvt_f32_f16(wrapper::vload(input_ptr + 4)),
+ vcvt_f32_f16(wrapper::vload(input_ptr + 8)),
+ vcvt_f32_f16(wrapper::vload(input_ptr + 12))};
+}
+
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+template <typename T> inline void store_result(T *ptr, const float32x4x4_t &v)
+{
+ ARM_COMPUTE_UNUSED(ptr, v);
+}
+
+template <> inline void store_result<float>(float *ptr, const float32x4x4_t &v)
+{
+ wrapper::vstore(ptr, v.val[0]);
+ wrapper::vstore(ptr + 4, v.val[1]);
+ wrapper::vstore(ptr + 8, v.val[2]);
+ wrapper::vstore(ptr + 12, v.val[3]);
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <> inline void store_result<float16_t>(float16_t *ptr, const float32x4x4_t &v)
+{
+ wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1])));
+ wrapper::vstore(ptr + 8, vcombine_f16(vcvt_f16_f32(v.val[2]), vcvt_f16_f32(v.val[3])));
+}
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+inline float32x4x4_t multiply_scale_vec(const int32x4x4_t &iv, float scale)
+{
+ const float32x4_t vscale = vdupq_n_f32(scale);
+
+ const float32x4x4_t ret = {{
+ vmulq_f32(vcvtq_f32_s32(iv.val[0]), vscale), vmulq_f32(vcvtq_f32_s32(iv.val[1]), vscale),
+ vmulq_f32(vcvtq_f32_s32(iv.val[2]), vscale), vmulq_f32(vcvtq_f32_s32(iv.val[3]), vscale),
+ }};
+ return ret;
+}
+} // namespace
+
+NEMultiplyScaleFactorKernel::NEMultiplyScaleFactorKernel()
+ : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f)
+{
+}
+
+void NEMultiplyScaleFactorKernel::configure(const ITensor *input, const ITensor *scale_factor,
+ ITensor *output, float multiplier)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), scale_factor->info(), output->info()));
+
+ _input = input;
+ _scale_factor = scale_factor;
+ _output = output;
+ _multiplier = multiplier;
+
+ // Configure kernel window
+ Window win_config = calculate_max_window(*input->info(), Steps());
+
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+ INEKernel::configure(win_config);
+}
+
+Status NEMultiplyScaleFactorKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *scale_factor,
+ const ITensorInfo *output, float multiplier)
+{
+ ARM_COMPUTE_UNUSED(multiplier);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, scale_factor, output));
+
+ return Status{};
+}
+
+template <typename T> void NEMultiplyScaleFactorKernel::multiply(const Window &window)
+{
+ constexpr auto window_step = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ // Collapse window and reset first dimension to handle tail calculations manually
+  // Only 2D input is supported
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ Iterator input(_input, win_collapsed);
+ Iterator output(_output, win_collapsed);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &id) {
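+      // One scale factor per row (indexed by y), combined with the extra multiplier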
+ auto scale = *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()}));
+ scale *= _multiplier;
+
+ const auto input_ptr = reinterpret_cast<const int32_t *>(input.ptr());
+ auto output_ptr = reinterpret_cast<T *>(output.ptr());
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step); x += window_step)
+ {
+        store_result<T>(&output_ptr[x], multiply_scale_vec(load_value(&input_ptr[x]), scale));
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ output_ptr[x] = input_ptr[x] * scale;
+ }
+ },
+ input, output);
+}
+
+void NEMultiplyScaleFactorKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ switch (_output->info()->data_type())
+ {
+ case DataType::F32:
+ NEMultiplyScaleFactorKernel::multiply<float>(window);
+ break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ NEMultiplyScaleFactorKernel::multiply<float16_t>(window);
+ break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ }
+}
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp
new file mode 100644
index 000000000..ad1bb9051
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEPReLUKernel.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+namespace
+{
+
+/** Conditional element-wise operations */
+enum class ConditionalOperation
+{
+ PRELU, /**< (x * y) for x < 0, x for x >= 0 */
+};
+
+template <ConditionalOperation op, typename ScalarType>
+inline ScalarType elementwise_conditional_op_scalar(const ScalarType &a, const ScalarType &b)
+{
+ auto res = ScalarType(0);
+
+ switch (op)
+ {
+ case ConditionalOperation::PRELU:
+ res = a < 0 ? a * b : a;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+ return res;
+}
+
+template <ConditionalOperation op>
+inline uint8_t elementwise_conditional_op_quantized_scalar(const float &a, const float &b,
+ QuantizationInfo qinfo)
+{
+ return qinfo.quantize(elementwise_conditional_op_scalar<op>(a, b), RoundingPolicy::TO_NEAREST_UP);
+}
+
+template <ConditionalOperation op, typename VectorType>
+inline VectorType elementwise_conditional_op(const VectorType &a, const VectorType &b)
+{
+ VectorType res = {0, 0, 0, 0};
+ VectorType const_0 = {0, 0, 0, 0};
+
+ switch (op)
+ {
+ case ConditionalOperation::PRELU:
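+      // Lane-wise select: keep a where a > 0, otherwise use a * b (the PReLU slope)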
+      res = wrapper::vbsl(wrapper::vcgt(a, const_0), a, wrapper::vmul(a, b));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+ return res;
+}
+
+template <ConditionalOperation op>
+inline float32x4x4_t elementwise_conditional_op(const float32x4x4_t &a, const float32x4x4_t &b)
+{
+ float32x4x4_t out = {{
+ elementwise_conditional_op<op>(a.val[0], b.val[0]),
+ elementwise_conditional_op<op>(a.val[1], b.val[1]),
+ elementwise_conditional_op<op>(a.val[2], b.val[2]),
+ elementwise_conditional_op<op>(a.val[3], b.val[3]),
+ }};
+ return out;
+}
+
+template <ConditionalOperation op, typename ScalarType, typename VectorType>
+inline VectorType elementwise_conditional_op_broadcast(const VectorType &a,
+ const ScalarType &broadcast_value,
+ const bool reorder)
+{
+ VectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag());
+ return elementwise_conditional_op<op>(reorder ? broadcast_vector : a,
+ reorder ? a : broadcast_vector);
+}
+
+template <ConditionalOperation op, typename ScalarType, typename VectorType>
+inline int elementwise_conditional_op_loop(int window_start_x, int window_end_x, int window_step_x,
+ const ScalarType *input1_ptr,
+ const ScalarType *input2_ptr, ScalarType *output_ptr)
+{
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = wrapper::vloadq(input1_ptr + x);
+ const auto b = wrapper::vloadq(input2_ptr + x);
+ wrapper::vstore(output_ptr + x, elementwise_conditional_op<op>(a, b));
+ }
+ return x;
+}
+
+template <ConditionalOperation op>
+inline int elementwise_conditional_op_quantized_loop(int window_start_x, int window_end_x,
+ int window_step_x, const uint8_t *input1_ptr,
+ const uint8_t *input2_ptr, uint8_t *output_ptr,
+ int32x4_t voffset1, int32x4_t voffset2,
+ float32x4_t vscale1, float32x4_t vscale2,
+ float32x4_t voffseto, float32x4_t invvscaleo)
+{
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ // Get inputs and compute output
+ const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1);
+ const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2);
+ const float32x4x4_t rf = elementwise_conditional_op<op>(af, bf);
+ store_quantized(output_ptr + x, rf, voffseto, invvscaleo);
+ }
+ return x;
+}
+
+template <ConditionalOperation op, typename ScalarType, typename VectorType>
+inline int elementwise_conditional_op_broadcast_loop(int window_start_x, int window_end_x,
+ int window_step_x,
+ const ScalarType *non_broadcast_input_ptr,
+ const ScalarType &broadcast_value,
+ ScalarType *output_ptr, const bool reorder)
+{
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = wrapper::vloadq((non_broadcast_input_ptr + x));
+ wrapper::vstore(output_ptr + x,
+ elementwise_conditional_op_broadcast<op>(a, broadcast_value, reorder));
+ }
+ return x;
+}
+
+template <ConditionalOperation op>
+inline int elementwise_conditional_op_quantized_broadcast_loop(
+ int window_start_x, int window_end_x, int window_step_x, const uint8_t *non_broadcast_input_ptr,
+ float32x4x4_t broadcast_vector, uint8_t *output_ptr, int32x4_t voffset_non_broadcast,
+ float32x4_t vscale_non_broadcast, float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
+{
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const float32x4x4_t af =
+ load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
+ const float32x4x4_t rf = elementwise_conditional_op<op>(reorder ? broadcast_vector : af,
+ reorder ? af : broadcast_vector);
+ store_quantized(output_ptr + x, rf, voffseto, invvscaleo);
+ }
+ return x;
+}
+
+template <ConditionalOperation op, typename ScalarType, typename VectorType>
+void elementwise_conditional_op(const ITensor *in1, const ITensor *in2, ITensor *out,
+ const Window &window)
+{
+ elementwise_op(in1, in2, out, window, &elementwise_conditional_op_scalar<op, ScalarType>,
+ &elementwise_conditional_op_broadcast_loop<op, ScalarType, VectorType>,
+ &elementwise_conditional_op_loop<op, ScalarType, VectorType>);
+}
+
+template <ConditionalOperation op>
+void elementwise_conditional_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out,
+ const Window &window)
+{
+ elementwise_op_quantized(in1, in2, out, window, &elementwise_conditional_op_quantized_scalar<op>,
+ &elementwise_conditional_op_quantized_broadcast_loop<op>,
+ &elementwise_conditional_op_quantized_loop<op>);
+}
+} // namespace
+
+NEPReLUKernel::NEPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {}
+
+void NEPReLUKernel::configure(const ITensor *input, const ITensor *alpha, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, alpha, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input->info(), *alpha->info(), *output->info()));
+
+ // Configure kernel window
+ const std::pair<TensorShape, ValidRegion> broadcast_pair =
+ ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info());
+ const TensorShape &out_shape = broadcast_pair.first;
+ const ValidRegion &valid_region = broadcast_pair.second;
+
+ // Auto initialize output if not initialized
+ auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type());
+
+ Window win = calculate_max_window(valid_region);
+
+ _input = input;
+ _alpha = alpha;
+ _output = output;
+ INEKernel::configure(win);
+}
+
+void NEPReLUKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ if (_input->info()->data_type() == DataType::F32)
+ {
+ elementwise_conditional_op<ConditionalOperation::PRELU, float, float32x4_t>(_input, _alpha,
+ _output, window);
+ }
+ else if (_input->info()->data_type() == DataType::QASYMM8)
+ {
+ elementwise_conditional_op_quantized<ConditionalOperation::PRELU>(_input, _alpha, _output,
+ window);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Wrong Type");
+ }
+}
+
+Status NEPReLUKernel::validate_arguments(const ITensorInfo &input, const ITensorInfo &alpha,
+ const ITensorInfo &output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &alpha, &output);
+
+ const TensorShape out_shape =
+ TensorShape::broadcast_shape(input.tensor_shape(), alpha.tensor_shape());
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
+ "Inputs are not broadcast compatible");
+
+ // Checks performed when output is configured
+ if (output.total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
+ "Wrong shape for output");
+ }
+
+ return Status{};
+}
+
+Status NEPReLUKernel::validate(const ITensorInfo *input, const ITensorInfo *alpha,
+ const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, alpha, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *alpha, *output));
+
+ return Status{};
+}
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp
new file mode 100644
index 000000000..acf0092eb
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *scale_factor)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scale_factor, 1, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->dimension(0) != input->dimension(1));
+
+ return Status{};
+}
+
+inline float32x4x4_t load_value(const float *input_ptr)
+{
+ return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4),
+ wrapper::vloadq(input_ptr + 8), wrapper::vloadq(input_ptr + 12)};
+}
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+inline const float32x4x4_t load_value(const float16_t *input_ptr)
+{
+ return {vcvt_f32_f16(wrapper::vload(input_ptr)), vcvt_f32_f16(wrapper::vload(input_ptr + 4)),
+ vcvt_f32_f16(wrapper::vload(input_ptr + 8)),
+ vcvt_f32_f16(wrapper::vload(input_ptr + 12))};
+}
+
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
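+// Bias each lane by +/-0.5 according to its sign so the following float-to-int conversion rounds
+// half away from zero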
+inline float32x4_t round(const float32x4_t &fv)
+{
+ const float32x4_t point5_f32x4 = vdupq_n_f32(0.5f);
+ const float32x4_t zero_f32x4 = vdupq_n_f32(0.0f);
+ // If value < 0, mask = -1, else mask = 0
+  int32x4_t mask_less_zero_ui32x4 = vreinterpretq_s32_u32(vcltq_f32(fv, zero_f32x4));
+ return vaddq_f32(fv, vaddq_f32(vcvtq_f32_s32(mask_less_zero_ui32x4), point5_f32x4));
+}
+
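+// Symmetric quantization: scale, round, clamp to [-max_scale, max_scale], then saturate-narrow to int8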
+inline int8x16_t vquantizeSymm(const float32x4x4_t &fv, float scale_factor_inv, int32_t max_scale)
+{
+ const float32x4_t vinvscale = vdupq_n_f32(scale_factor_inv);
+ const int32x4_t vposend = vdupq_n_s32(max_scale);
+ const int32x4_t vnagend = vdupq_n_s32(-max_scale);
+
+ const int32x4x4_t rf = {{
+#ifdef __aarch64__
+ vminq_s32(vposend,
+ vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))),
+ vminq_s32(vposend,
+ vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))),
+ vminq_s32(vposend,
+ vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))),
+ vminq_s32(vposend,
+ vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))),
+#else //__aarch64__
+ vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))),
+ vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))),
+ vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))),
+ vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))),
+#endif //__aarch64__
+ }};
+ const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
+ const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
+ return vcombine_s8(pa, pb);
+}
+} // namespace
+
+NEQuantizationSymmetricKernel::NEQuantizationSymmetricKernel()
+ : _input(nullptr), _output(nullptr), _scale_factor(nullptr)
+{
+}
+
+void NEQuantizationSymmetricKernel::configure(const ITensor *input, ITensor *output,
+ ITensor *scale_factor)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), output->info(), scale_factor->info()));
+
+ _input = input;
+ _output = output;
+ _scale_factor = scale_factor;
+
+ // Configure kernel window
+ Window win_config = calculate_max_window(*input->info(), Steps());
+
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+ INEKernel::configure(win_config);
+}
+
+Status NEQuantizationSymmetricKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *scale_factor)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, scale_factor));
+
+ return Status{};
+}
+
+template <typename T> void NEQuantizationSymmetricKernel::quantize(const Window &window)
+{
+ constexpr auto window_step = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+#ifdef __aarch64__
+ constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
+#else //__aarch64__
+ constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP;
+#endif //__aarch64__
+
+  // Reset first dimension to handle tail calculations manually
+  // Only 2D input is supported
+ Window win_collapsed = window;
+ Iterator input(_input, win_collapsed);
+ Iterator output(_output, win_collapsed);
+ const auto dim_x = _input->info()->dimension(0);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &id) {
+ const auto start = reinterpret_cast<const T *>(input.ptr());
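+      // Per-row symmetric scale: the row's largest absolute value maps to +/-127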
+ const auto min_max = std::minmax_element(start, start + dim_x);
+ const auto int8_scale = 127;
+ auto range = std::max(std::abs(*min_max.first), std::abs(*min_max.second));
+ if (range == 0)
+ {
+ *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = 1;
+ range = 1;
+ }
+ else
+ {
+ *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = range / int8_scale;
+ }
+ const auto scale_factor_inv = int8_scale / range;
+
+ auto input_ptr = reinterpret_cast<const T *>(input.ptr());
+ auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step); x += window_step)
+ {
+ wrapper::vstore(&output_ptr[x],
+ vquantizeSymm(load_value(&input_ptr[x]), scale_factor_inv, int8_scale));
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ int quantized = arm_compute::round(input_ptr[x] * scale_factor_inv, rounding_policy);
+ quantized = std::min(int8_scale, std::max(quantized, -int8_scale));
+ output_ptr[x] = static_cast<int8_t>(quantized);
+ }
+ },
+ input, output);
+}
+
+void NEQuantizationSymmetricKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ switch (_input->info()->data_type())
+ {
+ case DataType::F32:
+ NEQuantizationSymmetricKernel::quantize<float>(window);
+ break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ NEQuantizationSymmetricKernel::quantize<float16_t>(window);
+ break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ }
+}
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEReductionOperationKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEReductionOperationKernelEx.cpp
new file mode 100644
index 000000000..59e7d9beb
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEReductionOperationKernelEx.cpp
@@ -0,0 +1,677 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace
+{
+// Helper function to calculate the minimum value of the input vector. All the elements in the
+// output vector contain the min value.
+float32x2_t calculate_min(float32x4_t in)
+{
+ auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
+ return wrapper::vpmin(pmin, pmin);
+}
+
+// Helper function to calculate the maximum value of the input vector. All the elements in the
+// output vector contain the max value.
+float32x2_t calculate_max(float32x4_t in)
+{
+ auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
+ return wrapper::vpmax(pmax, pmax);
+}
+// Helper function to calculate the minimum value of the input vector. All the elements in the
+// output vector contain the min value.
+int32x2_t calculate_min(int32x4_t in)
+{
+ auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
+ return wrapper::vpmin(pmin, pmin);
+}
+
+// Helper function to calculate the maximum value of the input vector. All the elements in the
+// output vector contain the max value.
+int32x2_t calculate_max(int32x4_t in)
+{
+ auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
+ return wrapper::vpmax(pmax, pmax);
+}
+
+// Helper function to calculate the minimum value of the input vector. All the elements in the
+// output vector contain the min value.
+inline uint8x8_t calculate_min(uint8x16_t in)
+{
+ auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
+ pmin = wrapper::vpmin(pmin, pmin);
+ pmin = wrapper::vpmin(pmin, pmin);
+ return wrapper::vpmin(pmin, pmin);
+}
+// Helper function to calculate the maximum value of the input vector. All the elements in the
+// output vector contain the max value.
+inline uint8x8_t calculate_max(uint8x16_t in)
+{
+ auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
+ pmax = wrapper::vpmax(pmax, pmax);
+ pmax = wrapper::vpmax(pmax, pmax);
+ return wrapper::vpmax(pmax, pmax);
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+// Helper function to calculate the minimum value of the input vector. All the elements in the
+// output vector contain the min value.
+inline float16x4_t calculate_min(float16x8_t in)
+{
+ auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
+ pmin = wrapper::vpmin(pmin, pmin);
+ return wrapper::vpmin(pmin, pmin);
+}
+// Helper function to calculate the maximum value of the input vector. All the elements in the
+// output vector contain the max value.
+inline float16x4_t calculate_max(float16x8_t in)
+{
+ auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
+ pmax = wrapper::vpmax(pmax, pmax);
+ return wrapper::vpmax(pmax, pmax);
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+template <class F> class Reducer
+{
+public:
+ static void reduceX(const Window &window, const ITensor *input, ITensor *output, F f,
+ const ReduceOperation op)
+ {
+ // Set out window
+ Window out_window(window);
+ out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ // Get first input and output slices
+ Window in_slice = window.first_slice_window_1D();
+ Window out_slice = out_window.first_slice_window_1D();
+
+ do
+ {
+ Iterator in(input, in_slice);
+ Iterator out(output, out_slice);
+
+ f(in, out, in_slice, out_slice, *input->info(), op);
+ } while (window.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice));
+ }
+ static void reduceY(const Window &window, const ITensor *input, ITensor *output, F f,
+ const ReduceOperation op)
+ {
+ // Set in window
+ Window in_window(window);
+ Window out_window(window);
+
+ in_window.set(Window::DimY, Window::Dimension(0, 1, 1));
+ out_window.set(Window::DimY, Window::Dimension(0, output->info()->dimension(1),
+ output->info()->dimension(1)));
+
+ // Get first input and output slices
+ Window in_slice = in_window.first_slice_window_2D();
+ Window out_slice = out_window.first_slice_window_2D();
+
+ do
+ {
+ Iterator in(input, in_slice);
+ Iterator out(output, out_slice);
+
+ f(in, out, in_slice, out_slice, *input->info(), 1, op);
+ } while (in_window.slide_window_slice_2D(in_slice) &&
+ out_window.slide_window_slice_2D(out_slice));
+ }
+ static void reduceZ(const Window &window, const ITensor *input, ITensor *output, F f,
+ const ReduceOperation op)
+ {
+ // Set in window
+ Window in_window(window);
+ Window out_window(window);
+
+ in_window.set(Window::DimZ, Window::Dimension(0, 1, 1));
+ out_window.set(Window::DimZ, Window::Dimension(0, output->info()->dimension(2),
+ output->info()->dimension(2)));
+
+ // Get first input and output slices
+ Window in_slice = in_window.first_slice_window_3D();
+ Window out_slice = out_window.first_slice_window_3D();
+
+ do
+ {
+ Iterator in(input, in_slice);
+ Iterator out(output, out_slice);
+
+ f(in, out, in_slice, out_slice, *input->info(), 2, op);
+ } while (in_window.slide_window_slice_3D(in_slice) &&
+ out_window.slide_window_slice_3D(out_slice));
+ }
+ static void reduceW(const Window &window, const ITensor *input, ITensor *output, F f,
+ const ReduceOperation op)
+ {
+ // Set in/out window
+ Window in_window(window);
+ Window out_window(window);
+
+ in_window.set(3, Window::Dimension(0, 1, 1));
+ out_window.set(3, Window::Dimension(0, 1, 1));
+
+ // Get first input and output slices
+ Window in_slice = in_window.first_slice_window_4D();
+ Window out_slice = out_window.first_slice_window_4D();
+
+ do
+ {
+ Iterator in(input, in_slice);
+ Iterator out(output, out_slice);
+
+ f(in, out, in_slice, out_slice, *input->info(), 3, op);
+ } while (in_window.slide_window_slice_4D(in_slice) &&
+ out_window.slide_window_slice_4D(out_slice));
+ }
+};
+
+template <typename T, int S> struct RedOpX
+{
+ /** NEON vector tag type. */
+ using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+
+ inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice,
+ const TensorInfo &in_info, const ReduceOperation op)
+ {
+ ARM_COMPUTE_UNUSED(out_slice);
+ ARM_COMPUTE_UNUSED(in_info);
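+    // MIN/MAX reductions are seeded with the first input element; other ops start from zero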
+ auto init_res_value = static_cast<T>(0.f);
+ switch (op)
+ {
+ case ReduceOperation::MIN:
+ case ReduceOperation::MAX:
+ {
+ init_res_value = *reinterpret_cast<T *>(input.ptr());
+ break;
+ }
+ default:
+ break;
+ }
+ auto vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{});
+
+ execute_window_loop(in_slice,
+ [&](const Coordinates &) {
+ const auto in_ptr = reinterpret_cast<const T *>(input.ptr());
+ const auto vec_elements = wrapper::vloadq(in_ptr);
+
+ switch (op)
+ {
+ case ReduceOperation::MIN:
+ {
+ vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ break;
+ }
+ case ReduceOperation::MAX:
+ {
+ vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ },
+ input);
+
+ switch (op)
+ {
+ case ReduceOperation::MIN:
+ {
+ *(reinterpret_cast<T *>(output.ptr())) = wrapper::vgetlane(calculate_min(vec_res_value), 0);
+ break;
+ }
+ case ReduceOperation::MAX:
+ {
+ *(reinterpret_cast<T *>(output.ptr())) = wrapper::vgetlane(calculate_max(vec_res_value), 0);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ }
+};
+
+struct RedOpX_qasymm8
+{
+ inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice,
+ const TensorInfo &in_info, const ReduceOperation op)
+ {
+ ARM_COMPUTE_UNUSED(out_slice);
+ ARM_COMPUTE_UNUSED(in_info);
+
+ uint8x16_t vec_res_value = {0};
+
+ if (op == ReduceOperation::MIN || op == ReduceOperation::MAX)
+ {
+ vec_res_value = wrapper::vdup_n(*input.ptr(), wrapper::traits::vector_128_tag{});
+ }
+
+ execute_window_loop(in_slice,
+ [&](const Coordinates &) {
+ const auto vec_elements = wrapper::vloadq(input.ptr());
+ switch (op)
+ {
+ case ReduceOperation::MIN:
+ {
+ vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ break;
+ }
+ case ReduceOperation::MAX:
+ {
+ vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ },
+ input);
+
+ switch (op)
+ {
+ case ReduceOperation::MIN:
+ {
+ *(output.ptr()) = static_cast<uint8_t>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
+ break;
+ }
+ case ReduceOperation::MAX:
+ {
+ *(output.ptr()) = static_cast<uint8_t>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ }
+ }
+};
+
+template <typename T, int S> struct RedOpYZW
+{
+ /** NEON vector tag type. */
+ using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+ using neon_vector = typename wrapper::traits::neon_vector<T, S>::type;
+
+ inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice,
+ const TensorInfo &in_info, int axis, const ReduceOperation op)
+ {
+ ARM_COMPUTE_UNUSED(out_slice);
+
+ execute_window_loop(
+ in_slice,
+ [&](const Coordinates &) {
+ neon_vector vec_res_value = {0};
+ switch (op)
+ {
+ case ReduceOperation::MIN:
+ case ReduceOperation::MAX:
+ {
+ vec_res_value = wrapper::vloadq(reinterpret_cast<T *>(input.ptr()));
+ break;
+ }
+ default:
+ {
+ vec_res_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+ break;
+ }
+ }
+
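+          // Walk along the reduced axis, combining one vector of dimension-0 elements per step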
+ for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ {
+ T *in_ptr;
+ switch (axis)
+ {
+ case 1:
+ in_ptr = reinterpret_cast<T *>(
+ input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, dim)));
+ break;
+ case 2:
+ in_ptr = reinterpret_cast<T *>(
+ input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, dim)));
+ break;
+ case 3:
+ in_ptr = reinterpret_cast<T *>(
+ input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, 0, dim)));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ const auto vec_elements = wrapper::vloadq(in_ptr);
+
+ switch (op)
+ {
+ case ReduceOperation::MIN:
+ {
+ vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ break;
+ }
+ case ReduceOperation::MAX:
+ {
+ vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ }
+ wrapper::vstore(reinterpret_cast<T *>(output.ptr()), vec_res_value);
+ },
+ input, output);
+ }
+};
+
+struct RedOpYZW_qasymm8
+{
+ inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice,
+ const TensorInfo &in_info, int axis, const ReduceOperation op)
+ {
+ ARM_COMPUTE_UNUSED(out_slice);
+
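+    // Same folding as RedOpYZW but on 16 uint8 lanes; the accumulator is seeded
+    // with the vector at the current position before walking the reduced axis.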
+ execute_window_loop(
+ in_slice,
+ [&](const Coordinates &) {
+ auto vec_res_value = wrapper::vloadq(input.ptr());
+
+ for (unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim)
+ {
+ uint8_t *in_ptr;
+ switch (axis)
+ {
+ case 1:
+ in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, index_dim));
+ break;
+ case 2:
+ in_ptr =
+ input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, index_dim));
+ break;
+ case 3:
+ in_ptr =
+ input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, 0, index_dim));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ const auto vec_elements = wrapper::vloadq(in_ptr);
+
+ switch (op)
+ {
+ case ReduceOperation::MIN:
+ {
+ vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ break;
+ }
+ case ReduceOperation::MAX:
+ {
+ vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ }
+ wrapper::vstore(reinterpret_cast<uint8_t *>(output.ptr()), vec_res_value);
+ },
+ input, output);
+ }
+};
+
+void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsigned int axis,
+ const ReduceOperation op)
+{
+ const bool is_complex = (input->info()->num_channels() == 2);
+ if (is_complex)
+ {
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+
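+  // Dispatch on the reduction axis first, then on the data type; each case runs the
+  // matching templated Reducer. The F16 paths are compiled only when
+  // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC is defined.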
+ switch (axis)
+ {
+ case 0:
+ switch (input->info()->data_type())
+ {
+ case DataType::QASYMM8:
+ return Reducer<RedOpX_qasymm8>::reduceX(window, input, output, RedOpX_qasymm8(), op);
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ return Reducer<RedOpX<float16_t, 8>>::reduceX(window, input, output,
+ RedOpX<float16_t, 8>(), op);
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F32:
+ return Reducer<RedOpX<float, 4>>::reduceX(window, input, output, RedOpX<float, 4>(), op);
+ case DataType::S32:
+ return Reducer<RedOpX<int32_t, 4>>::reduceX(window, input, output, RedOpX<int32_t, 4>(),
+ op);
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ case 1:
+ switch (input->info()->data_type())
+ {
+ case DataType::QASYMM8:
+ return Reducer<RedOpYZW_qasymm8>::reduceY(window, input, output, RedOpYZW_qasymm8(), op);
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ return Reducer<RedOpYZW<float16_t, 8>>::reduceY(window, input, output,
+ RedOpYZW<float16_t, 8>(), op);
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F32:
+ return Reducer<RedOpYZW<float, 4>>::reduceY(window, input, output, RedOpYZW<float, 4>(),
+ op);
+ case DataType::S32:
+ return Reducer<RedOpYZW<int32_t, 4>>::reduceY(window, input, output,
+ RedOpYZW<int32_t, 4>(), op);
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ case 2:
+ switch (input->info()->data_type())
+ {
+ case DataType::QASYMM8:
+ return Reducer<RedOpYZW_qasymm8>::reduceZ(window, input, output, RedOpYZW_qasymm8(), op);
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ return Reducer<RedOpYZW<float16_t, 8>>::reduceZ(window, input, output,
+ RedOpYZW<float16_t, 8>(), op);
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F32:
+ return Reducer<RedOpYZW<float, 4>>::reduceZ(window, input, output, RedOpYZW<float, 4>(),
+ op);
+ case DataType::S32:
+ return Reducer<RedOpYZW<int32_t, 4>>::reduceZ(window, input, output,
+ RedOpYZW<int32_t, 4>(), op);
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ case 3:
+ switch (input->info()->data_type())
+ {
+ case DataType::QASYMM8:
+ return Reducer<RedOpYZW_qasymm8>::reduceW(window, input, output, RedOpYZW_qasymm8(), op);
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ return Reducer<RedOpYZW<float16_t, 8>>::reduceW(window, input, output,
+ RedOpYZW<float16_t, 8>(), op);
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F32:
+ return Reducer<RedOpYZW<float, 4>>::reduceW(window, input, output, RedOpYZW<float, 4>(),
+ op);
+ case DataType::S32:
+ return Reducer<RedOpYZW<int32_t, 4>>::reduceW(window, input, output,
+ RedOpYZW<int32_t, 4>(), op);
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported reduction axis");
+ }
+}
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis,
+ ReduceOperation op)
+{
+ ARM_COMPUTE_UNUSED(op);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+
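+  // Only single-channel tensors are supported; a two-channel (complex) input is rejected.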
+ if (input->num_channels() == 1)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32,
+ DataType::F16, DataType::F32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Not support complex");
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
+ "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
+
+ if (output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != output->num_channels());
+
+ const TensorShape output_shape =
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis);
+ const TensorInfo tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_reshaped);
+ }
+
+ return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output,
+ unsigned int axis, ReduceOperation op)
+{
+ ARM_COMPUTE_UNUSED(op);
+
+ // Calculate output shape and set if empty
+ const TensorShape output_shape =
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis);
+
+ // Output auto initialization if not yet initialized
+ DataType output_data_type = input->data_type();
+ auto_init_if_empty(*output, input->clone()
+ ->set_tensor_shape(output_shape)
+ .set_data_type(output_data_type)
+ .reset_padding()
+ .set_is_resizable(true));
+
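+  // Each iteration processes 16 bytes, so the step depends on the element size.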
+ unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->data_type());
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+
+ Status err = (window_changed)
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
+
+ return std::make_tuple(err, win);
+}
+} // namespace
+
+NEReductionOperationKernelEx::NEReductionOperationKernelEx()
+ : _input(nullptr), _output(nullptr), _reduction_axis(0), _op(ReduceOperation::MAX),
+ _border_size()
+{
+}
+
+BorderSize NEReductionOperationKernelEx::border_size() const { return _border_size; }
+
+void NEReductionOperationKernelEx::configure(const ITensor *input, ITensor *output,
+ unsigned int axis, ReduceOperation op)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
+
+ unsigned int num_elems_processed_per_iteration =
+ 16 / data_size_from_type(input->info()->data_type());
+
+ _input = input;
+ _output = output;
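+  // For an X-axis reduction the input width must be padded up to a multiple of the
+  // vector step; the required elements are exposed as the kernel border.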
+ _border_size =
+ (axis == 0)
+ ? BorderSize(0, num_elems_processed_per_iteration -
+ (input->info()->dimension(0) % num_elems_processed_per_iteration),
+ 0, 0)
+ : BorderSize();
+ _op = op;
+ _reduction_axis = axis;
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis, op);
+
+ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+
+ INEKernel::configure(std::get<1>(win_config));
+}
+
+Status NEReductionOperationKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+ unsigned int axis, ReduceOperation op)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
+ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(
+ validate_and_configure_window(input->clone().get(), output->clone().get(), axis, op)));
+
+ return Status{};
+}
+
+void NEReductionOperationKernelEx::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ reduce_op(window, _input, _output, _reduction_axis, _op);
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp
new file mode 100644
index 000000000..36a2f55a9
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+#include <arm_neon.h>
+#include <cstdint>
+
+using namespace arm_compute::misc::shape_calculator;
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 1);
+
+ // Validate output if initialized
+ if (output->total_size() != 0)
+ {
+ const DataLayout data_layout = input->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height =
+ get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_channel =
+ get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ const int idx_batch =
+ get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_width] % block_shape != 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_height] % block_shape != 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] !=
+ output->tensor_shape()[idx_batch]);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_channel] % (block_shape * block_shape) !=
+ 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().total_size() !=
+ output->tensor_shape().total_size());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+} // namespace
+
+NESpaceToDepthLayerKernelEx::NESpaceToDepthLayerKernelEx()
+ : _input(nullptr), _output(nullptr), _block_shape()
+{
+}
+
+void NESpaceToDepthLayerKernelEx::configure(const ITensor *input, ITensor *output,
+ int32_t block_shape)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
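+  // Derive the output shape from the block size and auto-initialize the output
+  // tensor info if it has not been set yet.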
+ TensorShape output_shape = compute_space_to_depth_shape_ex(input->info(), block_shape);
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape));
+
+ _input = input;
+ _block_shape = block_shape;
+ _output = output;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps());
+ INEKernel::configure(win);
+}
+
+Status NESpaceToDepthLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+ int32_t block_shape)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, block_shape));
+ return Status{};
+}
+
+void NESpaceToDepthLayerKernelEx::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
+
+ const DataLayout data_layout = _input->info()->data_layout();
+ const int channel_idx =
+ get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ const int element_size = _input->info()->element_size();
+
+ const size_t channel_size = _input->info()->dimension(channel_idx);
+
+ Window slice_out = window.first_slice_window_3D();
+
+ int batch_id = 0;
+
+ // Main loop for NCHW and NHWC
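+  // Both layouts use the same mapping: the output channel index is split into the
+  // original input channel (channel_id % channel_size) and a spatial offset inside
+  // the block (channel_id / channel_size); only the coordinate order differs.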
+ if (_output->info()->data_layout() == DataLayout::NCHW)
+ {
+ do
+ {
+ Iterator out(_output, slice_out);
+ execute_window_loop(slice_out,
+ [&](const Coordinates &id) {
+ const size_t channel_id = id.z();
+ const size_t in_x =
+ id.x() * _block_shape + (channel_id / channel_size) % _block_shape;
+ const size_t in_y =
+ id.y() * _block_shape + (channel_id / channel_size) / _block_shape;
+ const int z = channel_id % channel_size;
+ Coordinates input_coords{in_x, in_y, z, batch_id};
+ memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+ },
+ out);
+ ++batch_id;
+ } while (window.slide_window_slice_3D(slice_out));
+ }
+ else
+ {
+ do
+ {
+ Iterator out(_output, slice_out);
+ execute_window_loop(slice_out,
+ [&](const Coordinates &id) {
+ const size_t channel_id = id.x();
+ const size_t in_x =
+ id.y() * _block_shape + (channel_id / channel_size) % _block_shape;
+ const size_t in_y =
+ id.z() * _block_shape + (channel_id / channel_size) / _block_shape;
+ const int z = channel_id % channel_size;
+ Coordinates input_coords{z, in_x, in_y, batch_id};
+ memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+ },
+ out);
+ ++batch_id;
+ } while (window.slide_window_slice_3D(slice_out));
+ }
+}
+} // namespace arm_compute