9 files changed, 374 insertions, 1206 deletions
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl
deleted file mode 100644
index 03717cfe9..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
-/** Perform arg_max/arg_min
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type.
- *       e.g. -DDATA_TYPE=short
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
- *            e.g. -DDEPTH_OUT=16
- * @attention Operation type(code) specifying which operation to perform should be passed as
- *            preprocessor argument using -DOP_CODE = number. e.g. -DOP_CODE=1
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data
- * types:
- *                                                  U8/QASYMM8/S8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension
- *                                                  (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X
- *                                                  processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension
- *                                                  (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y
- *                                                  processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension
- *                                                  (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z
- *                                                  processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element
- *                                                  in the source image
- * @param[in]  input_stride_w                       Stride of the source tensor in W dimension
- *                                                  (in bytes)
- * @param[in]  input_step_w                         output_stride_w * number of elements along W
- *                                                  processed per workitem(in bytes)
- * @param[out] output_ptr                           Pointer to the destination image.
- *                                                  Supported data types: U32
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension
- *                                                  (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension
- *                                                  (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension
- *                                                  (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_stride_w                      Stride of the source tensor in W dimension
- *                                                  (in bytes)
- * @param[in]  output_step_w                        output_stride_w * number of elements along W
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the
- *                                                  destination image
- * @param[in]  axis                                 Axis through which reduction occurs
- * @param[in]  dim                                  Dimension across the axis to be reduced.
- */
-
-__kernel void arg_op(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), const int axis,
-                     const int dim)
-{
-  Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
-  Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
-
-  int indices[4] = {
-      get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT,
-      get_global_id(2) / DEPTH_OUT,
-  };
-
-  DATA_TYPE value =
-      *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
-  DATA_TYPE tval = value;
-  int idx = 0;
-  for (int i = 1; i < dim; ++i)
-  {
-    indices[axis] = i;
-
-#if OP_CODE == 1 // ArgMax
-    value = max(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1],
-                                                               indices[2], indices[3])));
-#elif OP_CODE == 2 // ArgMin
-    value = min(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1],
-                                                               indices[2], indices[3])));
-#else
-    return;
-
-#endif
-
-    if (tval != value)
-    {
-      idx = indices[axis];
-      tval = value;
-    }
-  }
-
-  *((__global uint *)out.ptr) = idx;
-}
-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
deleted file mode 100644
index f74c1c103..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers_asymm.h"
-
-#ifdef SATURATE
-#define ADD(x, y) add_sat((x), (y))
-#define SUB(x, y) sub_sat((x), (y))
-#else /* SATURATE */
-#define ADD(x, y) (x) + (y)
-#define SUB(x, y) (x) - (y)
-#endif /* SATURATE */
-
-/** Performs a pixelwise addition used to quantize down the int32 accumulator values of GEMMLowp to
- *  QASYMM8
- *
- * The following computations will be performed:
- *
- *  -# Add offset terms to inputs
-    -# Get scaled value of two inputs
- *  -# Add inputs
- *  -# Add offset terms to final result
- *  -# Multiply each entry of result by result_mult_int
- *  -# Shift the int32 accumulator by result_shift
- *  -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
- *
- * @attention The inputs and output data types need to be passed at compile time using
- *            -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
- *            e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
- * @attention The number of bits to shift left of input tensors must be passed at compile time using
- *            -DLEFT_SHIFT
- * @attention The offset, scalar scale factor and number of bits to shift right of input tensors
- *            must be passed at compile time using -DIN1_OFFSET, -RIN1_MULT_INT, -DIN1_SHIFT,
- -DIN2_OFFSET,
- *            -RIN2_MULT_INT and -DIN2_SHIFT
- * @attention The offset, scalar scale factor and number of bits to shift right of output tensor
- *            must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and
- -DRESULT_SHIFT
- *
- * @attention The input and output data_types need to be passed at compile time using
- *            -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
- *            e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
- * @attention The inputs and output scale information of qasymm8 need to be passed at compile time
- *            using -DSCALE_IN1, -DSCALE_IN2 and -DSCALE_OUT:
- *            e.g. -DSCALE_IN1=1.f -DSCALE_IN2=1.f -DSCALE_OUT=2.f
- * @attention The inputs and output scale offset need to be passed at compile time using
- *            -DOFFSET_IN1, -DOFFSET_IN2 and -DOFFSET_OUT:
- *            e.g. -DOFFSET_IN1=0 -DOFFSET_IN2=0 -DOFFSET_OUT=0
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
- *            -DVEC_SIZE=16
- * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise
- *            wrapping policy will be used.
- *
- * @param[in]  in1_ptr                           Pointer to the source tensor.
- *                                               Supported data types: QASYMM8
- * @param[in]  in1_stride_x                      Stride of the source tensor in X dimension
- *                                               (in bytes)
- * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed
- *                                               per workitem(in bytes)
- * @param[in]  in1_stride_y                      Stride of the source tensor in Y dimension
- *                                               (in bytes)
- * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed
- *                                               per workitem(in bytes)
- * @param[in]  in1_stride_z                      Stride of the source tensor in Z dimension
- *                                               (in bytes)
- * @param[in]  in1_step_z                        in1_stride_z * number of elements along Z processed
- *                                               per workitem(in bytes)
- * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source
- *                                               tensor
- * @param[in]  in2_ptr                           Pointer to the source tensor. Supported data types:
- *                                               QASYMM8
- * @param[in]  in2_stride_x                      Stride of the source tensor in X dimension
- *                                               (in bytes)
- * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed
- *                                               per workitem(in bytes)
- * @param[in]  in2_stride_y                      Stride of the source tensor in Y dimension
- *                                               (in bytes)
- * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed
- *                                               per workitem(in bytes)
- * @param[in]  in2_stride_z                      Stride of the source tensor in Z dimension
- *                                               (in bytes)
- * @param[in]  in2_step_z                        in2_stride_z * number of elements along Z processed
- *                                               per workitem(in bytes)
- * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source
- *                                               tensor
- * @param[out] out_ptr                           Pointer to the destination tensor.
- *                                               Supported data types: QASYMM8
- * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension
- *                                               (in bytes)
- * @param[in]  out_step_x                        out_stride_x * number of elements along X processed
- *                                               per workitem(in bytes)
- * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension
- *                                               (in bytes)
- * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed
- *                                               per workitem(in bytes)
- * @param[in]  out_stride_z                      Stride of the source tensor in Z dimension
- *                                               (in bytes)
- * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed
- *                                               per workitem(in bytes)
- * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination
- *                                               tensor
- */
-__kernel void arithmetic_add_qasymm8(TENSOR3D_DECLARATION(in1), TENSOR3D_DECLARATION(in2),
-                                     TENSOR3D_DECLARATION(out))
-{
-  // Get pixels pointer
-  Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
-  Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
-  Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
-
-  // Load data
-  VEC_DATA_TYPE(int, 16)
-  in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16));
-  VEC_DATA_TYPE(int, 16)
-  in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16));
-
-  // Get scaled value of two inputs
-  VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET);
-  VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET);
-
-  VEC_DATA_TYPE(int, 16)
-  left_shift = (VEC_DATA_TYPE(int, 16))1 << (VEC_DATA_TYPE(int, 16))(LEFT_SHIFT);
-  VEC_DATA_TYPE(int, 16) shifted_in1_val = in1_val * left_shift;
-  VEC_DATA_TYPE(int, 16) shifted_in2_val = in2_val * left_shift;
-
-  VEC_DATA_TYPE(int, 16)
-  scaled_in1_val =
-      ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in1_val, IN1_MULT_INT, IN1_SHIFT, 16);
-  VEC_DATA_TYPE(int, 16)
-  scaled_in2_val =
-      ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in2_val, IN2_MULT_INT, IN2_SHIFT, 16);
-
-  // Add inputs and multiply with a multiplier smaller than 1
-  VEC_DATA_TYPE(int, 16) sum_val = scaled_in1_val + scaled_in2_val;
-  VEC_DATA_TYPE(int, 16)
-  out_val =
-      ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(sum_val, RESULT_MULT_INT, RESULT_SHIFT, 16);
-  out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET);
-
-  VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16));
-
-  // TODO: Apply min-max BOUND to support fuse with relu.
-  /*
-  #if defined(MIN_BOUND)
-      res = max(res, (uchar16)MIN_BOUND);
-  #endif // defined(MIN_BOUND)
-  #if defined(MAX_BOUND)
-      res = min(res, (uchar16)MAX_BOUND);
-  #endif // defined(MAX_BOUND)
-  */
-
-  // Store result
-  VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr);
-}
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
deleted file mode 100644
index 4147a0017..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
+++ /dev/null
@@ -1,233 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-
-#ifndef SCALE
-#define SCALE 1.0f
-#endif
-#ifndef OFFSET
-#define OFFSET 0
-#endif
-#ifndef VEC_SIZE
-#define VEC_SIZE 1
-#endif
-
-#if defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
-/** Perform a cast operation on an input tensor.
- *
- * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and
- *            -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
- *            -DVEC_SIZE=16
- * @attention -DBOOL_INPUT : Whether type of input is bool.
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data
- *                                                  types: F16/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in
- *                                                  bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X
- *                                                  processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in
- *                                                  bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y
- *                                                  processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in
- *                                                  bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z
- *                                                  processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source
- *                                                  image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data
- *                                                  types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension
- *                                                  (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension
- *                                                  (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in
- *                                                  bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the
- *                                                  destination image
- */
-__kernel void cast(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
-{
-  Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
-  Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-  VSTORE(VEC_SIZE)
-  (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr),
-           VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),
-   0, (__global DATA_TYPE_OUT *)output.ptr);
-  VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
-  res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr),
-                VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
-#if defined(BOOL_INPUT)
-  VEC_DATA_TYPE(char, VEC_SIZE) tmp = CONVERT(res, VEC_DATA_TYPE(char, VEC_SIZE));
-  VEC_DATA_TYPE(char, VEC_SIZE) mask = (VEC_DATA_TYPE(char, VEC_SIZE))(1);
-  res = CONVERT(tmp & mask, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
-#endif // defined(BOOL_INPUT)
-
-  VSTORE(VEC_SIZE)(res, 0, (__global DATA_TYPE_OUT *)output.ptr);
-}
-
-/** Perform a cast operation on an QASYMM8 input tensor.
- * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and
- *            -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
- * @attention Offset and Scale of input should be given as a preprocessor argument using
- *            -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
- *            -DVEC_SIZE=16
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data
- *                                                  types: F16/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in
- *                                                  bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X
- *                                                  processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in
- *                                                  bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y
- *                                                  processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in
- *                                                  bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z
- *                                                  processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source
- *                                                  image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data
- *                                                  types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension
- *                                                  (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension
- *                                                  (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in
- *                                                  bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the
- *                                                  destination image
- */
-__kernel void cast_qasymm_in(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
-{
-  Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
-  Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-  VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
-  in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
-  VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET);
-  VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE);
-
-  VEC_DATA_TYPE(int, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(int, VEC_SIZE)) - offset;
-  VEC_DATA_TYPE(float, VEC_SIZE) out_data = CONVERT(tmp, VEC_DATA_TYPE(float, VEC_SIZE)) * scale;
-
-  VSTORE(VEC_SIZE)
-  (CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0,
-   (__global DATA_TYPE_OUT *)output.ptr);
-}
-
-/** Perform a cast operation on an QASYMM8 output tensor.
- * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and
- *            -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
- * @attention Offset and Scale of output should be given as a preprocessor argument using
- *            -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
- *            -DVEC_SIZE=16
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data
- *                                                  types: F16/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in
- *                                                 bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X
- *                                                  processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in
- *                                                  bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y
- *                                                  processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in
- *                                                  bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z
- *                                                  processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source
- *                                                  image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data
- *                                                  types: U8
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension
- *                                                  (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension
- *                                                  (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in
- *                                                  bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the
- *                                                  destination image
- */
-__kernel void cast_qasymm_out(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
-{
-  Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
-  Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-  VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
-  in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
-  VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET);
-  VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE);
-
-  VEC_DATA_TYPE(float, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(float, VEC_SIZE)) / scale;
-  VEC_DATA_TYPE(float, VEC_SIZE) out_data = tmp + CONVERT(offset, VEC_DATA_TYPE(float, VEC_SIZE));
-
-  VSTORE(VEC_SIZE)
-  (CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0,
-   (__global DATA_TYPE_OUT *)output.ptr);
-}
-#endif // defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
deleted file mode 100644
index 0285c955b..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
-/** Perform space to depth rearrangement of tensor
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
- *            e.g. -DDEPTH_OUT=16
- * @attention The value of the z-axis of output tensor should be given as a preprocessor argument
- *            using -DZ_OUT=size. e.g. -DZ_OUT=16
- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g.
- *            -DBLOCK_SIZE=1
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data
- *                                                  types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in
- *                                                  bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X
- *                                                  processed per workitem(in  bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in
- *                                                  bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y
- *                                                  processed per workitem(in  bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in
- *                                                  bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z
- *                                                  processed per workitem(in  bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source
- *                                                  image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data
- *                                                  types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension
- *                                                  (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension
- *                                                  (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in
- *                                                  bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in
- *                                                  bytes)
- * @param[in]  output_step_w                        output_stride_w * number of elements along W
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the
- *                                                  destination image
- */
-__kernel void depth_to_space_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
-{
-  Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
-  Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT);
-
-  int out_index[4] = {0};
-  int in_index[4] = {0};
-
-  out_index[0] = get_global_id(0);         // W
-  out_index[1] = get_global_id(1);         // H
-  out_index[2] = get_global_id(2) % Z_OUT; // C
-  out_index[3] = get_global_id(2) / Z_OUT; // B
-
-  in_index[0] = out_index[0] / BLOCK_SIZE;
-  in_index[1] = out_index[1] / BLOCK_SIZE;
-  in_index[2] = out_index[2] +
-                ((out_index[1] % BLOCK_SIZE) * BLOCK_SIZE + out_index[0] % BLOCK_SIZE) * DEPTH_OUT;
-  in_index[3] = out_index[3];
-
-  *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(
-      &in, in_index[0], in_index[1], in_index[2], in_index[3]));
-}
-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
-
-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
-/** Perform space to depth rearrangement of tensor (NHWC)
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
- *            e.g. -DDEPTH_OUT=16
- * @attention The value of the z-axis of output tensor should be given as a preprocessor argument
- *            using -DZ_OUT=size. e.g. -DZ_OUT=16
- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g.
- *            -DBLOCK_SIZE=1
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data
- *                                                  types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in
- *                                                  bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X
- *                                                  processed per workitem(in  bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in
- *                                                  bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y
- *                                                  processed per workitem(in  bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in
- *                                                  bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z
- *                                                  processed per workitem(in  bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source
- *                                                  image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data
- *                                                  types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension
- *                                                  (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension
- *                                                  (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in
- *                                                  bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in
- *                                                  bytes)
- * @param[in]  output_step_w                        output_stride_w * number of elements along W
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the
- *                                                  destination image
- */
-__kernel void depth_to_space_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
-{
-  Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
-  Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT);
-
-  int out_index[4] = {0};
-  int in_index[4] = {0};
-
-  out_index[0] = get_global_id(0);         // C
-  out_index[1] = get_global_id(1);         // W
-  out_index[2] = get_global_id(2) % Z_OUT; // H
-  out_index[3] = get_global_id(2) / Z_OUT; // B
-
-  in_index[0] = out_index[0] +
-                ((out_index[2] % BLOCK_SIZE) * BLOCK_SIZE + out_index[1] % BLOCK_SIZE) * DEPTH_OUT;
-  in_index[1] = out_index[1] / BLOCK_SIZE;
-  in_index[2] = out_index[2] / BLOCK_SIZE;
-  in_index[3] = out_index[3];
-
-  *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(
-      &in, in_index[0], in_index[1], in_index[2], in_index[3]));
-}
-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
index 2d0b6a299..e07a25ec9 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
@@ -15,7 +15,7 @@
  */
 
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,7 +37,6 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-
 #ifndef ARM_COMPUTE_HELPER_H
 #define ARM_COMPUTE_HELPER_H
 
@@ -59,16 +58,219 @@
 #pragma OPENCL EXTENSION cl_arm_printf : enable
 #endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
 
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
 #define EXPAND(x) x
 
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x       The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
 #define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
 
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each
+ * element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 2)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s  The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
 #define VLOAD_STR(size) vload##size
 #define VLOAD(size) VLOAD_STR(size)
 
 #define VSTORE_STR(size) vstore##size
 #define VSTORE(size) VSTORE_STR(size)
 
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// The convert_* built-in functions with the _sat modifier are not supported for floating-point
+// destination types, so the _sat names are defined here as aliases of the plain conversions
+// (without _sat) to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_half
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
 #define VEC_DATA_TYPE_STR(type, size) type##size
 #define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
 
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
index a83b1a8a5..5f1b3f902 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
@@ -15,7 +15,7 @@
  */
 
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,29 +37,112 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-
 #ifndef ARM_COMPUTE_HELPERS_ASYMM_H
 #define ARM_COMPUTE_HELPERS_ASYMM_H
 
 #include "helpers.h"
 
+/** Convert the given vector with round to nearest even rounding mode
+ *
+ * @param[in] x    The target to be converted
+ * @param[in] type The target type
+ *
+ * @return The converted vector
+ */
+#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type)
+
+/** Quantize a floating-point scalar value to 8-bit asymmetric
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline uchar quantize_qasymm8(float input, float offset, float scale)
+{
+  float out_f32 = input / scale + offset;
+  uchar res_u8 = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, int), uchar);
+  return res_u8;
+}
+
+/** Dequantize a scalar value from 8-bit asymmetric to floating-point
+ *
+ * @param[in] input  Input value to dequantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return dequantized value
+ */
+inline float dequantize_qasymm8(uchar input, float offset, float scale)
+{
+  return ((float)input - offset) * scale;
+}
+
+/** Dequantize a scalar value from signed 8-bit asymmetric to floating-point
+ *
+ * @param[in] input  Input value to dequantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return dequantized value
+ */
+inline float dequantize_qasymm8_signed(char input, float offset, float scale)
+{
+  return ((float)input - offset) * scale;
+}
+
+/** Quantize a vector of values from floating-point
+ *
+ * @param[in] type Output data type.
+ * @param[in] size Size of vector.
+ *
+ * @return quantized values
+ */
+#define QUANTIZE_IMPL(type, size)                                                                 \
+  inline VEC_DATA_TYPE(type, size)                                                                \
+      quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale)          \
+  {                                                                                               \
+    VEC_DATA_TYPE(float, size)                                                                    \
+    out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \
+    VEC_DATA_TYPE(type, size)                                                                     \
+    res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)),                        \
+                      VEC_DATA_TYPE(type, size));                                                 \
+    return res;                                                                                   \
+  }
+
+/** Dequantize a vector of values to floating-point
+ *
+ * @param[in] type Input data type.
+ * @param[in] size Size of vector.
+ *
+ * @return dequantized values in floating point
+ */
+#define DEQUANTIZE_IMPL(type, size)                                                       \
+  inline VEC_DATA_TYPE(float, size)                                                       \
+      dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
+  {                                                                                       \
+    return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale;                 \
+  }
+
 /** Correctly-rounded-to-nearest division by a power-of-two.
  *
  * @param[in] size Size of vector.
  *
  * @return Correctly-rounded-to-nearest division by a power-of-two.
  */
-#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size)                                     \
-  inline VEC_DATA_TYPE(int, size)                                                    \
-      asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, int exponent) \
-  {                                                                                  \
-    VEC_DATA_TYPE(int, size)                                                         \
-    mask = (1 << exponent) - 1;                                                      \
-    const VEC_DATA_TYPE(int, size) zero = 0;                                         \
-    const VEC_DATA_TYPE(int, size) one = 1;                                          \
-    VEC_DATA_TYPE(int, size)                                                         \
-    threshold = (mask >> 1) + select(zero, one, x < 0);                              \
-    return (x >> exponent) + select(zero, one, (x & mask) > threshold);              \
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size)                        \
+  inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size( \
+      VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent)    \
+  {                                                                     \
+    const VEC_DATA_TYPE(int, size) zero = (VEC_DATA_TYPE(int, size))0;  \
+    const VEC_DATA_TYPE(int, size) one = (VEC_DATA_TYPE(int, size))1;   \
+    VEC_DATA_TYPE(int, size)                                            \
+    mask = (one << exponent) - one;                                     \
+    VEC_DATA_TYPE(int, size)                                            \
+    threshold = (mask >> 1) + select(zero, one, x < 0);                 \
+    return (x >> exponent) + select(zero, one, (x & mask) > threshold); \
   }
 
 /** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
@@ -81,9 +164,19 @@
     b_64 = convert_long##size(b);                                              \
     VEC_DATA_TYPE(long, size)                                                  \
     ab_64 = a_64 * b_64;                                                       \
-    /* COMPMID-907 */                                                          \
+    /* Revert COMPMID-907 */                                                   \
+    VEC_DATA_TYPE(long, size)                                                  \
+    mask1 = 1 << 30;                                                           \
+    VEC_DATA_TYPE(long, size)                                                  \
+    mask2 = 1 - (1 << 30);                                                     \
+    VEC_DATA_TYPE(long, size)                                                  \
+    is_positive_or_zero = ab_64 >= 0;                                          \
+    VEC_DATA_TYPE(long, size)                                                  \
+    nudge = select(mask2, mask1, is_positive_or_zero);                         \
+    VEC_DATA_TYPE(long, size)                                                  \
+    mask = 1ll << 31;                                                          \
     VEC_DATA_TYPE(int, size)                                                   \
-    ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31));             \
+    ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask);                  \
     return select(ab_x2_high32, INT_MAX, overflow);                            \
   }
 
@@ -335,9 +428,18 @@
     return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size);                         \
   }
 
+#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
+#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
+#define DEQUANTIZE_STR(input, offset, scale, type, size) \
+  dequantize_##type##size(input, offset, scale)
+#define DEQUANTIZE(input, offset, scale, type, size) \
+  DEQUANTIZE_STR(input, offset, scale, type, size)
+
 #define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) \
   asymm_rounding_divide_by_POW2_##size(x, exponent)
 #define ASYMM_MULT(a, b, size) asymm_mult##size(a, b)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
+  ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size)
 #define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
   ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
 #define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) \
@@ -360,11 +462,53 @@
 #define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \
   asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
 
+#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size)                                               \
+  inline VEC_DATA_TYPE(int, size)                                                                 \
+      multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \
+  {                                                                                               \
+    const int left_shift = shift > 0 ? shift : 0;                                                 \
+    const int right_shift = shift > 0 ? 0 : -shift;                                               \
+    return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size),       \
+                                         right_shift, size);                                      \
+  }
+#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) \
+  multiply_by_quantized_multiplier##size(input, qmul, shift)
+
+QUANTIZE_IMPL(uchar, 1)
+QUANTIZE_IMPL(char, 1)
+QUANTIZE_IMPL(uint, 1)
+QUANTIZE_IMPL(int, 1)
+QUANTIZE_IMPL(uchar, 4)
+QUANTIZE_IMPL(ushort, 4)
+QUANTIZE_IMPL(short, 4)
+QUANTIZE_IMPL(uchar, 16)
+QUANTIZE_IMPL(char, 16)
+QUANTIZE_IMPL(ushort, 16)
+QUANTIZE_IMPL(short, 16)
+QUANTIZE_IMPL(uint, 16)
+QUANTIZE_IMPL(int, 16)
+
+DEQUANTIZE_IMPL(uchar, 1)
+DEQUANTIZE_IMPL(char, 1)
+DEQUANTIZE_IMPL(uint, 1)
+DEQUANTIZE_IMPL(int, 1)
+DEQUANTIZE_IMPL(uchar, 4)
+DEQUANTIZE_IMPL(ushort, 4)
+DEQUANTIZE_IMPL(short, 4)
+DEQUANTIZE_IMPL(uchar, 16)
+DEQUANTIZE_IMPL(char, 16)
+DEQUANTIZE_IMPL(ushort, 16)
+DEQUANTIZE_IMPL(short, 16)
+DEQUANTIZE_IMPL(uint, 16)
+DEQUANTIZE_IMPL(int, 16)
+
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1)
 ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
 ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
 ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
 ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)
 
+ASYMM_MULT_IMPL(1)
 ASYMM_MULT_IMPL(2)
 ASYMM_MULT_IMPL(4)
 ASYMM_MULT_IMPL(8)
@@ -375,16 +519,19 @@ ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4)
 ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8)
 ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16)
 
+ASYMM_SELECT_USING_MASK_IMPL(1)
 ASYMM_SELECT_USING_MASK_IMPL(2)
 ASYMM_SELECT_USING_MASK_IMPL(4)
 ASYMM_SELECT_USING_MASK_IMPL(8)
 ASYMM_SELECT_USING_MASK_IMPL(16)
 
+ASYMM_MASK_IF_ZERO_IMPL(1)
 ASYMM_MASK_IF_ZERO_IMPL(2)
 ASYMM_MASK_IF_ZERO_IMPL(4)
 ASYMM_MASK_IF_ZERO_IMPL(8)
 ASYMM_MASK_IF_ZERO_IMPL(16)
 
+ASYMM_MASK_IF_NON_ZERO_IMPL(1)
 ASYMM_MASK_IF_NON_ZERO_IMPL(2)
 ASYMM_MASK_IF_NON_ZERO_IMPL(4)
 ASYMM_MASK_IF_NON_ZERO_IMPL(8)
@@ -400,6 +547,7 @@ ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4)
 ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8)
 ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16)
 
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(1)
 ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2)
 ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4)
 ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8)
@@ -415,9 +563,16 @@ ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4)
 ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8)
 ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16)
 
+ASYMM_RESCALE_IMPL(1)
 ASYMM_RESCALE_IMPL(2)
 ASYMM_RESCALE_IMPL(4)
 ASYMM_RESCALE_IMPL(8)
 ASYMM_RESCALE_IMPL(16)
 
+MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(1)
+MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(2)
+MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(4)
+MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(8)
+MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(16)
+
 #endif // ARM_COMPUTE_HELPERS_ASYMM_H
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl
deleted file mode 100644
index 12c8eeb79..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-
-#ifndef VEC_SIZE
-#define VEC_SIZE 1
-#endif
-
-#if defined(DATA_TYPE)
-/** Returns result of prelu function implemented as below:
- * f(input) = alpha * input for input < 0, f(input) = input for input >= 0.
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
- *            -DVEC_SIZE=16
- * @note Can only take floating point data types.
- *
- * @param[in]  input1_ptr                            Pointer to the source image. Supported Data
- *                                                   types : F16/F32
- * @param[in]  input1_stride_x                       Stride of the source image in X dimension (in
- *                                                   bytes)
- * @param[in]  input1_step_x                         input1_stride_x * number of elements along X
- *                                                   processed per workitem(in bytes)
- * @param[in]  input1_stride_y                       Stride of the source image in Y dimension (in
- *                                                   bytes)
- * @param[in]  input1_step_y                         input1_stride_y * number of elements along Y
- *                                                   processed per workitem(in bytes)
- * @param[in]  input1_stride_z                       Stride of the source tensor in Z dimension (in
- *                                                   bytes)
- * @param[in]  input1_step_z                         input1_stride_z * number of elements along Z
- *                                                   processed per workitem(in bytes)
- * @param[in]  input1_offset_first_element_in_bytes  The offset of the first element in the source
- *                                                   image
- * @param[in]  alpha_ptr                             Pointer to the source image. Supported Data
- *                                                   types : F16/F32
- * @param[in]  alpha_stride_x                        Stride of the source image in X dimension (in
- *                                                   bytes)
- * @param[in]  alpha_step_x                          input2_stride_x * number of elements along X
- *                                                   processed per workitem(in bytes)
- * @param[in]  alpha_stride_y                        Stride of the source image in Y dimension (in
- *                                                   bytes)
- * @param[in]  alpha_step_y                          input2_stride_y * number of elements along Y
- *                                                   processed per workitem(in bytes)
- * @param[in]  alpha_stride_z                        Stride of the source tensor in Z dimension (in
- *                                                   bytes)
- * @param[in]  alpha_step_z                          input2_stride_z * number of elements along Z
- *                                                   processed per workitem(in bytes)
- * @param[in]  alpha_offset_first_element_in_bytes   The offset of the first element in the source
- *                                                   image
- *
- * @param[out] output_ptr                            Pointer to the destination image. Supported
- *                                                   data types: same as @p input_ptr
- * @param[in]  output_stride_x                       Stride of the destination image in X dimension
- *                                                   (in bytes)
- * @param[in]  output_step_x                         output_stride_x * number of elements along X
- *                                                   processed per workitem(in bytes)
- * @param[in]  output_stride_y                       Stride of the destination image in Y dimension
- *                                                   (in bytes)
- * @param[in]  output_step_y                         output_stride_y * number of elements along Y
- *                                                   processed per workitem(in bytes)
- * @param[in]  output_stride_z                       Stride of the source tensor in Z dimension (in
- *                                                   bytes)
- * @param[in]  output_step_z                         output_stride_z * number of elements along Z
- *                                                   processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the
- *                                                   destination image
- */
-__kernel void prelu(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(alpha),
-                    TENSOR3D_DECLARATION(output))
-{
-  Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
-  Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha);
-  Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-  VSTORE(VEC_SIZE)
-  (VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) < 0
-       ? VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) *
-             VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)alpha.ptr)
-       : VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr),
-   0, (__global DATA_TYPE *)output.ptr);
-}
-#endif // defined(DATA_TYPE)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl
deleted file mode 100644
index a66e107d1..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-#define SUB(x, y) (x) - (y)
-
-#if defined(OFF_IN) && defined(OFF_ALPHA) && defined(OFF_OUT) && defined(SCALE_IN) && \
-    defined(SCALE_ALPHA) && defined(SCALE_OUT) && defined(VEC_SIZE)
-
-#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
-#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
-#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE)
-#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
-#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
-#define SELECT_TYPE VEC_INT
-
-/** Returns result of prelu function implemented as below:
- *  f(input) = alpha * input for input < 0, f(input) = input for input >= 0.
- *
- * @attention Data type can be passed using the -DDATA_TYPE_IN compile flag, e.g.
- *            -DDATA_TYPE_IN=uchar
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
- *            -DVEC_SIZE=16
- * @note Can only take uchar data types.
- *
- * @param[in]  input1_ptr                            Pointer to the source image. Supported Data
- *                                                   types : QASYMM8
- * @param[in]  input1_stride_x                       Stride of the source image in X dimension (in
- *                                                   bytes)
- * @param[in]  input1_step_x                         input1_stride_x * number of elements along X
- *                                                   processed per workitem(in bytes)
- * @param[in]  input1_stride_y                       Stride of the source image in Y dimension (in
- *                                                   bytes)
- * @param[in]  input1_step_y                         input1_stride_y * number of elements along Y
- *                                                   processed per workitem(in bytes)
- * @param[in]  input1_stride_z                       Stride of the source tensor in Z dimension (in
- *                                                   bytes)
- * @param[in]  input1_step_z                         input1_stride_z * number of elements along Z
- *                                                   processed per workitem(in bytes)
- * @param[in]  input1_offset_first_element_in_bytes  The offset of the first element in the source
- *                                                   image
- * @param[in]  alpha_ptr                             Pointer to the source image. Supported Data
- *                                                   types : QASYMM8
- * @param[in]  alpha_stride_x                        Stride of the source image in X dimension (in
- *                                                   bytes)
- * @param[in]  alpha_step_x                          input2_stride_x * number of elements along X
- *                                                   processed per workitem(in bytes)
- * @param[in]  alpha_stride_y                        Stride of the source image in Y dimension (in
- *                                                   bytes)
- * @param[in]  alpha_step_y                          input2_stride_y * number of elements along Y
- *                                                   processed per workitem(in bytes)
- * @param[in]  alpha_stride_z                        Stride of the source tensor in Z dimension (in
- *                                                   bytes)
- * @param[in]  alpha_step_z                          input2_stride_z * number of elements along Z
- *                                                   processed per workitem(in bytes)
- * @param[in]  alpha_offset_first_element_in_bytes   The offset of the first element in the source
- *                                                   image
- * @param[out] output_ptr                            Pointer to the destination image. Supported
- *                                                   data types: same as @p input_ptr
- * @param[in]  output_stride_x                       Stride of the destination image in X dimension
- *                                                   (in bytes)
- * @param[in]  output_step_x                         output_stride_x * number of elements along X
- *                                                   processed per workitem(in bytes)
- * @param[in]  output_stride_y                       Stride of the destination image in Y dimension
- *                                                   (in bytes)
- * @param[in]  output_step_y                         output_stride_y * number of elements along Y
- *                                                   processed per workitem(in bytes)
- * @param[in]  output_stride_z                       Stride of the source tensor in Z dimension (in
- *                                                   bytes)
- * @param[in]  output_step_z                         output_stride_z * number of elements along Z
- *                                                   processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the
- *                                                   destination image
- */
-__kernel void prelu_qasymm8(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(alpha),
-                            TENSOR3D_DECLARATION(output))
-{
-  // Get pixels pointer
-  Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
-  Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha);
-  Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-  VEC_INT in_vec = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)input.ptr), VEC_INT);
-  VEC_INT alpha_vec = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)alpha.ptr), VEC_INT);
-
-  in_vec = SUB(in_vec, (VEC_INT)((int)OFF_IN));
-  alpha_vec = SUB(alpha_vec, (VEC_INT)((int)OFF_ALPHA));
-
-  const VEC_FLOAT inf32 = CONVERT(in_vec, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN);
-  const VEC_FLOAT alphaf32 = CONVERT(alpha_vec, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_ALPHA);
-  const VEC_FLOAT outf32 =
-      select(inf32, inf32 * alphaf32, CONVERT(inf32 < (VEC_FLOAT)0, SELECT_TYPE));
-  const VEC_FLOAT qresf32 = outf32 / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFF_OUT));
-  const VEC_UCHAR res = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_UCHAR);
-
-  VSTORE(VEC_SIZE)
-  (res, 0, (__global uchar *)output.ptr);
-}
-
-#endif // defined(OFF_IN) && defined(OFF_ALPHA) && defined(OFF_OUT) && defined(SCALE_IN) &&
-       // defined(SCALE_ALPHA) && defined(SCALE_OUT) && defined(VEC_SIZE)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl
deleted file mode 100644
index eb612f834..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN)
-/** Perform space to depth rearrangement of tensor
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size.
- *            e.g. -DDEPTH_IN=16
- * @attention The value of the z-axis of input tensor depth should be given as a preprocessor
- *            argument using -DZ_IN=size. e.g. -DZ_IN=16
- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g.
- *            -DBLOCK_SIZE=1
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data
- *                                                  types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in
- *                                                  bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X
- *                                                  processed per workitem(in  bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in
- *                                                  bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y
- *                                                  processed per workitem(in  bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in
- *                                                  bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z
- *                                                  processed per workitem(in  bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source
- *                                                  image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data
- *                                                  types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension
- *                                                  (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension
- *                                                  (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in
- *                                                  bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in
- *                                                  bytes)
- * @param[in]  output_step_w                        output_stride_w * number of elements along W
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the
- *                                                  destination image
- */
-__kernel void space_to_depth_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
-{
-  Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN);
-  Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
-
-  int out_index[4] = {0};
-  int in_index[4] = {0};
-
-  in_index[0] = get_global_id(0);        // W
-  in_index[1] = get_global_id(1);        // H
-  in_index[2] = get_global_id(2) % Z_IN; // C
-  in_index[3] = get_global_id(2) / Z_IN; // B
-
-  out_index[0] = in_index[0] / BLOCK_SIZE;
-  out_index[1] = in_index[1] / BLOCK_SIZE;
-  out_index[2] =
-      in_index[2] + ((in_index[1] % BLOCK_SIZE) * BLOCK_SIZE + in_index[0] % BLOCK_SIZE) * DEPTH_IN;
-  out_index[3] = in_index[3];
-
-  *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2],
-                                          out_index[3])) = *((__global DATA_TYPE *)in.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(Z_IN) && defined(BLOCK_SIZE) && defined(Z_IN)
-
-#if defined(DATA_TYPE) && defined(Z_IN) && defined(BLOCK_SIZE) && defined(Z_IN)
-/** Perform space to depth rearrangement of tensor
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size.
- *            e.g. -DDEPTH_IN=16
- * @attention The value of the z-axis of input tensor depth should be given as a preprocessor
- *            argument using -DZ_IN=size. e.g. -DZ_IN=16
- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g.
- *            -DBLOCK_SIZE=1
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data
- *                                                  types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in
- *                                                  bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X
- *                                                  processed per workitem(in  bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in
- *                                                  bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y
- *                                                  processed per workitem(in  bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in
- *                                                  bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z
- *                                                  processed per workitem(in  bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source
- *                                                  image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data
- *                                                  types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension
- *                                                  (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension
- *                                                  (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in
- *                                                  bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in
- *                                                  bytes)
- * @param[in]  output_step_w                        output_stride_w * number of elements along W
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the
- *                                                  destination image
- */
-__kernel void space_to_depth_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
-{
-  Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN);
-  Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
-
-  int out_index[4] = {0};
-  int in_index[4] = {0};
-
-  in_index[0] = get_global_id(0);        // C
-  in_index[1] = get_global_id(1);        // W
-  in_index[2] = get_global_id(2) % Z_IN; // H
-  in_index[3] = get_global_id(2) / Z_IN; // B
-
-  out_index[0] =
-      in_index[0] + ((in_index[2] % BLOCK_SIZE) * BLOCK_SIZE + in_index[1] % BLOCK_SIZE) * DEPTH_IN;
-  out_index[1] = in_index[1] / BLOCK_SIZE;
-  out_index[2] = in_index[2] / BLOCK_SIZE;
-  out_index[3] = in_index[3];
-
-  *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2],
-                                          out_index[3])) = *((__global DATA_TYPE *)in.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN)