35 files changed, 1902 insertions, 999 deletions
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl
new file mode 100644
index 000000000..f54c7bde3
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+#define CONST_ONE 1.f
+#define DIV_OP(a, b) ((a) / (b))
+#define RSQRT_OP(a) DIV_OP(CONST_ONE, sqrt((a)))
+
+// Inverse Square-root Activation
+inline TYPE rsqrt_op(TYPE x)
+{
+    return RSQRT_OP(x);
+}
+
+#define ACTIVATION_OP2(op, x) op##_op(x)
+#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
+
+#if defined(ACT)
+
+/** This performs an activation function floating point inputs.
+ *
+ * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void activation_layer_ex(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get pixels pointer
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load data
+    TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
+
+    // Perform activation
+    data = ACTIVATION_OP(ACT, data);
+
+    // Store result
+    VSTORE(VEC_SIZE)
+    (data, 0, (__global DATA_TYPE *)output.ptr);
+}
+
+#endif /* defined(ACT) */
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl
new file mode 100644
index 000000000..9a6921d7c
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
+/** Perform arg_max/arg_min
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
+ * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using
+ * -DOP_CODE = number. e.g. -DOP_CODE=1
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                         output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: U32
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  axis                                 Axis through which reduction occurs for max value index
+ * @param[in]  dim                                  Dimension across the axis to be reduced.
+ */
+
+__kernel void arg_op(TENSOR4D_DECLARATION(input),
+                     TENSOR4D_DECLARATION(output),
+		     const int axis,
+		     const int dim)
+{
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+    int indices[4] =
+    {
+        get_global_id(0),
+        get_global_id(1),
+        get_global_id(2) % DEPTH_OUT,
+        get_global_id(2) / DEPTH_OUT,
+    };
+
+    DATA_TYPE value = *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
+    DATA_TYPE tval = value;
+    int idx = 0;
+    for(int i = 1; i < dim; ++i)
+    {
+      indices[axis] = i;
+
+      #if OP_CODE == 1 // ArgMax
+        value = max(value, *((__global DATA_TYPE *)
+		             tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])));
+      #elif OP_CODE == 2 //ArgMin
+        value = min(value, *((__global DATA_TYPE *)
+		    	     tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])));
+      #else
+	return;
+
+      #endif
+
+      if(tval!=value)
+      {
+        idx = indices[axis];
+        tval = value;
+      }
+    }
+
+    *((__global uint *)out.ptr) = idx;
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_ex.cl
new file mode 100644
index 000000000..2ed698951
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_ex.cl
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifdef SATURATE
+#define SUB(x, y) sub_sat((x), (y))
+#else /* SATURATE */
+#define SUB(x, y) (x) - (y)
+#endif /* SATURATE */
+
+/** This function subtracts one tensors from another.
+ *
+ * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short
+ * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
+ *
+ * @param[in]  in1_ptr                           Pointer to the source tensor. Supported data types: U8, S16
+ * @param[in]  in1_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in1_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in1_step_z                        in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  in2_ptr                           Pointer to the source tensor. Supported data types: U8, S16
+ * @param[in]  in2_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in2_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in2_step_z                        in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: U8, S16
+ * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void arithmetic_sub_ex(
+    TENSOR3D_DECLARATION(in1),
+    TENSOR3D_DECLARATION(in2),
+    TENSOR3D_DECLARATION(out))
+{
+    // Get pixels pointer
+    Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+    Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+    // Load values
+    VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+    in_a = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+    VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+    in_b = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+
+    // Calculate and store result
+    vstore16(SUB(in_a, in_b), 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
index 0c0a9ede6..5cd0a4309 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
@@ -2,32 +2,20 @@
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
  * Copyright (c) 2016, 2017 ARM Limited.
  *
- * SPDX-License-Identifier: MIT
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
  *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
+ *      http://www.apache.org/licenses/LICENSE-2.0
  *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 #include "helpers_asymm.h"
 
-#if defined(FIXED_POINT_POSITION)
-#include "fixed_point.h"
-#endif /* FIXED_POINT_POSITION */
-
 #ifdef SATURATE
 #define ADD(x, y) add_sat((x), (y))
 #define SUB(x, y) sub_sat((x), (y))
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/batch_to_space_nd.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/batch_to_space_nd.cl
new file mode 100644
index 000000000..ad6a48a02
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/batch_to_space_nd.cl
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE0) && defined(BLOCK_SIZE1) && defined(BATCH_OUT)
+/** Perform batch to space rearrangement of tensor
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor batch should be given as a preprocessor argument using -DBATCH_OUT=size. e.g. -DBATCH_OUT=16
+ * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE0=size. e.g. -DBLOCK_SIZE0=1
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in  bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in  bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in  bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ *
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p inpu
+t_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in
+bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void batch_to_space_nd(
+     TENSOR4D_DECLARATION(input),
+     TENSOR4D_DECLARATION(output))
+ {
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+    int out_index[4]={0};
+    int in_index[4]={0};
+
+    out_index[0] = get_global_id(0);//W
+    out_index[1] = get_global_id(1);//H
+    out_index[2] = get_global_id(2) % DEPTH_OUT;//C
+    out_index[3] = get_global_id(2) / DEPTH_OUT;//N
+
+    in_index[0] = out_index[0]/BLOCK_SIZE1;
+    in_index[1] = out_index[1]/BLOCK_SIZE0;
+    in_index[2] = out_index[2];
+    in_index[3] = out_index[3] + ((out_index[1] % BLOCK_SIZE0) * BLOCK_SIZE0 + out_index[0] % BLOCK_SIZE1) * BATCH_OUT;
+
+    *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3]));
+ }
+#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE0) && defined(BLOCK_SIZE1) && defined(BATCH_OUT)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl
new file mode 100644
index 000000000..bea61f53e
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(OP_CODE) && defined(DATA_TYPE)
+/** returns truth value of the two input tensors for BINARY LOGICAL OP.
+ * where BINARY LOGICAL OP can be AND, OR.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=uchar
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using
+ * -DOP_CODE = number. e.g. -DOP_CODE=1
+ *
+ * @param[in]  input1_ptr                            Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in]  input1_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input1_step_x                         input1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input1_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input1_step_y                         input1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input1_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input1_step_z                         input1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input1_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ *
+ * @param[in]  input2_ptr                            Pointer to the source tensor.Supported data types: QASYMM8
+ * @param[in]  input2_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input2_step_x                         input2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input2_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input2_step_y                         input2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input2_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input2_step_z                         input2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input2_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ *
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: QASYMM8
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ */
+__kernel void binary_logical_op(
+    TENSOR3D_DECLARATION(input1),
+    TENSOR3D_DECLARATION(input2),
+    TENSOR3D_DECLARATION(output))
+{
+    Tensor3D input1  = CONVERT_TO_TENSOR3D_STRUCT(input1);
+    Tensor3D input2  = CONVERT_TO_TENSOR3D_STRUCT(input2);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    #if OP_CODE == 1 // LOGICAL AND
+     VSTORE(VEC_SIZE)
+     (CONVERT(VLOAD(VEC_SIZE)
+	     (0, (__global DATA_TYPE *)input1.ptr) && VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr),
+             VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr);
+
+    #elif OP_CODE == 2 // LOGICAL OR
+     VSTORE(VEC_SIZE)
+     (CONVERT(VLOAD(VEC_SIZE)
+	     (0, (__global DATA_TYPE *)input1.ptr) || VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr),
+             VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr);
+
+    #else // OP NOT SUPPORTED
+     return
+
+    #endif
+}
+#endif //if defined(OP_CODE) && defined(DATA_TYPE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
index 113804cca..3d4675e5d 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
@@ -2,38 +2,34 @@
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
  * Copyright (c) 2017 ARM Limited.
  *
- * SPDX-License-Identifier: MIT
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
  *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
+ *      http://www.apache.org/licenses/LICENSE-2.0
  *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 #include "helpers.h"
 
-#ifndef SCALE_IN
-#define SCALE_IN    1.0f
+#ifndef SCALE
+#define SCALE    1.0f
+#endif
+#ifndef OFFSET
+#define OFFSET   0
 #endif
-#ifndef OFFSET_IN
-#define OFFSET_IN   0
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
 #endif
 
+#if defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
 /** Perform a cast operation on an input tensor.
  *
- * @attention Data type can be passed using the -DDATA_TYPE_IN compile flag, e.g. -DDATA_TYPE_IN=float
+ * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
  * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
  *
  * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
@@ -65,9 +61,9 @@ __kernel void cast(
                      0, (__global DATA_TYPE_OUT *)output.ptr);
 }
 
-
 /** Perform a cast operation on an QASYMM8 input tensor.
- *
+ * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
+ * @attention Offset and Scale of input should be given as a preprocessor argument using -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5
  * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
  *
  * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
@@ -96,8 +92,8 @@ __kernel void cast_qasymm_in(
 
     VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_data =
         VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
-    VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET_IN);
-    VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE_IN);
+    VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET);
+    VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE);
 
     VEC_DATA_TYPE(int, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(int, VEC_SIZE)) - offset;
     VEC_DATA_TYPE(float, VEC_SIZE) out_data = CONVERT(tmp, VEC_DATA_TYPE(float, VEC_SIZE)) * scale;
@@ -108,7 +104,8 @@ __kernel void cast_qasymm_in(
 
 
 /** Perform a cast operation on an QASYMM8 output tensor.
- *
+ * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
+ * @attention Offset and Scale of output should be given as a preprocessor argument using -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5
  * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
  *
  * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
@@ -137,8 +134,8 @@ __kernel void cast_qasymm_out(
 
     VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_data =
         VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
-    VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET_IN);
-    VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE_IN);
+    VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET);
+    VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE);
 
     VEC_DATA_TYPE(float, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(float, VEC_SIZE)) / scale;
     VEC_DATA_TYPE(float, VEC_SIZE) out_data = tmp + CONVERT(offset, VEC_DATA_TYPE(float, VEC_SIZE));
@@ -146,3 +143,4 @@ __kernel void cast_qasymm_out(
     VSTORE(VEC_SIZE)(CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),
                      0, (__global DATA_TYPE_OUT *)output.ptr);
 }
+#endif // defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op.cl
new file mode 100644
index 000000000..765072556
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op.cl
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) && defined(OP_CODE)
+/** Returns truth value of comparison operators.
+ * Comparison operators may be equal, not_equal etc.
+ *
+ * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN, -DDATA_TYPE_OUT,
+ * e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT = uchar
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using
+ * -DOP_CODE = number. e.g. -DOP_CODE=1
+ *
+ * @param[in]  input1_ptr                            Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input1_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input1_step_x                         input1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input1_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input1_step_y                         input1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input1_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input1_step_z                         input1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input1_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ *
+ * @param[in]  input2_ptr                            Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input2_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input2_step_x                         input2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input2_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input2_step_y                         input2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input2_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input2_step_z                         input2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input2_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ *
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: QASYMM8
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void comparison_op(
+    TENSOR3D_DECLARATION(input1),
+    TENSOR3D_DECLARATION(input2),
+    TENSOR3D_DECLARATION(output))
+{
+    Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1);
+    Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    #if OP_CODE == 1 //EQUAL
+     VSTORE(VEC_SIZE)
+     (CONVERT(VLOAD(VEC_SIZE)
+             (0, (__global DATA_TYPE_IN *)input1.ptr) == VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input2.ptr),
+             VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),0, (__global DATA_TYPE_OUT *)output.ptr);
+
+    #elif OP_CODE == 2 //NOT_EQUAL
+     VSTORE(VEC_SIZE)
+     (CONVERT(VLOAD(VEC_SIZE)
+             (0, (__global DATA_TYPE_IN *)input1.ptr) != VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input2.ptr),
+             VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)output.ptr);
+
+    #else // OP NOT SUPPORTED
+     return;
+
+    #endif
+}
+#endif // defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) && defined(OP_CODE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl
new file mode 100644
index 000000000..1eb305f7b
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+#define SUB(x, y) (x) - (y)
+
+#ifndef VEC_SIZE
+#define VEC_SIZE  1
+#endif
+
+#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(DATA_TYPE_OUT)
+
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+#define VEC_OUT VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
+
+/** Returns the truth value of comparison .
+ * @attention Offset and Scale of both input should be given as a preprocessor argument using -DOFFSET_IN1=int, -DOFFSET_IN2=int, -DSCALE_IN1=float and -DSCALE_IN2=float. e.g. -DOFFSET_IN1=1, -DOFFSET_IN2=0, -DSCALE_IN1=0.5, -DSCALE_IN2=0.5
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using
+ * -DOP_CODE = number. e.g. -DOP_CODE=1
+ *
+ * @param[in]  input1_ptr                            Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in]  input1_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input1_step_x                         input1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input1_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input1_step_y                         input1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input1_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input1_step_z                         input1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input1_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ *
+ * @param[in]  input2_ptr                            Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in]  input2_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input2_step_x                         input2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input2_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input2_step_y                         input2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input2_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input2_step_z                         input2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input2_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ *
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: QASYMM8
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void comparison_op_qasymm8(
+    TENSOR3D_DECLARATION(in1),
+    TENSOR3D_DECLARATION(in2),
+    TENSOR3D_DECLARATION(out))
+{
+    // Get pixels pointer
+    Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+    Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+    VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)in1.ptr), VEC_INT);
+    VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)in2.ptr), VEC_INT);
+
+    in_a = SUB(in_a, (VEC_INT)((int)OFFSET_IN1));
+    in_b = SUB(in_b, (VEC_INT)((int)OFFSET_IN2));
+
+    const VEC_FLOAT in1f32  = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1);
+    const VEC_FLOAT in2f32  = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2);
+
+    #if OPCODE == 1 //EQUAL QUANTIZED
+      VSTORE(VEC_SIZE)(CONVERT(in1f32 == in2f32, VEC_OUT), 0, (__global DATA_TYPE_OUT *)out.ptr);
+
+    #elif OPCODE == 2 //NOT EQUAL QUANTIZED
+      VSTORE(VEC_SIZE)(CONVERT(in1f32 != in2f32, VEC_OUT), 0, (__global DATA_TYPE_OUT *)out.ptr);
+
+    #else // OP NOT SUPPORTED
+      return;
+
+    #endif
+}
+#endif // defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(DATA_TYPE_OUT)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
new file mode 100644
index 000000000..fef2243e7
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE)
+/** Perform space to depth rearrangement of tensor
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
+ * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. -DBLOCK_SIZE=1
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in  bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in  bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in  bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p inpu
+t_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in
+bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void depth_to_space(
+     TENSOR4D_DECLARATION(input),
+     TENSOR4D_DECLARATION(output))
+ {
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+    int out_index[4]={0};
+    int in_index[4]={0};
+
+    out_index[0] = get_global_id(0);//W
+    out_index[1] = get_global_id(1);//H
+    out_index[2] = get_global_id(2) % DEPTH_OUT;//C
+    out_index[3] = get_global_id(2) / DEPTH_OUT;//B
+
+    in_index[0] = out_index[0]/BLOCK_SIZE;
+    in_index[1] = out_index[1]/BLOCK_SIZE;
+    in_index[2] = out_index[2] + ((out_index[1] % BLOCK_SIZE) * BLOCK_SIZE + out_index[0] % BLOCK_SIZE) * DEPTH_OUT;
+    in_index[3] = out_index[3];
+
+    *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2],in_index[3]));
+ }
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl
new file mode 100644
index 000000000..348458fe9
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
+/** Perform embedding_lookup of input tensor
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16
+ * @attention Number of input dimensions are passed as a preprocessor argument using -DNUM_DIMS=size, e.g. -DNUM_DIMS=4
+ *
+ * @param[in]  input_ptr                             Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[in]  input_stride_w                        Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                          output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                         output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  lookups_ptr                           Pointer to the lookups vector. Supported data types: S32
+ * @param[in]  lookups_stride_x                      Stride of the lookups vector in X dimension (in bytes)
+ * @param[in]  lookups_step_x                        lookups_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector
+ */
+
+__kernel void embedding_lookup(TENSOR4D_DECLARATION(input),
+                               TENSOR4D_DECLARATION(output),
+                               VECTOR_DECLARATION(lookups))
+{
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+    Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT);
+
+    Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups);
+
+    //lookup ids for based on the tensor dimensions
+    int lup_id[4] = {0};
+
+    lup_id[0] = (NUM_DIMS == 1)?*((__global int *)vector_offset(&lups,get_global_id(0)))
+	                       :get_global_id(0);
+    lup_id[1] = (NUM_DIMS == 2)?*((__global int *)vector_offset(&lups,get_global_id(1)))
+	                       :get_global_id(1);
+    lup_id[2] = (NUM_DIMS == 3)?*((__global int *)vector_offset(&lups,get_global_id(2)))
+	                       :get_global_id(2)%DEPTH_OUT;
+    lup_id[3] = (NUM_DIMS == 4)?*((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT))
+                               :get_global_id(2) / DEPTH_OUT;
+
+    in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + lup_id[1] * input_step_y
+              + lup_id[2] * input_step_z + lup_id[3] * input_step_w;
+
+    VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
+                     0, (__global DATA_TYPE *)out.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl
new file mode 100644
index 000000000..69d94f30a
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(DATA_TYPE)
+/** Perform an exponential operation on an input tensor.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Can only take floating point data types.
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void exp_layer(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    VSTORE(VEC_SIZE)
+    (exp(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr)), 0, (__global DATA_TYPE *)output.ptr);
+}
+#endif // defined(DATA_TYPE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/fixed_point.h b/libs/ARMComputeEx/src/core/CL/cl_kernels/fixed_point.h
deleted file mode 100644
index 7807533e2..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/fixed_point.h
+++ /dev/null
@@ -1,565 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_FIXED_POINT_H
-#define ARM_COMPUTE_FIXED_POINT_H
-
-#define TYPE_ALIAS(type, alias) \
-  typedef type alias;           \
-  typedef type alias##x##1;     \
-  typedef type##2 alias##x##2;  \
-  typedef type##3 alias##x##3;  \
-  typedef type##4 alias##x##4;  \
-  typedef type##8 alias##x##8;  \
-  typedef type##16 alias##x##16;
-
-TYPE_ALIAS(char, qs8)
-TYPE_ALIAS(short, qs16)
-TYPE_ALIAS(int, qs32)
-
-#define qs8_MIN ((char)CHAR_MIN)
-#define qs8_MAX ((char)CHAR_MAX)
-#define qs16_MIN ((short)SHRT_MIN)
-#define qs16_MAX ((short)SHRT_MAX)
-#define qs32_MIN ((int)INT_MIN)
-#define qs32_MAX ((int)INT_MAX)
-
-#define qu8_MIN ((uchar)0)
-#define qu8_MAX ((uchar)UCHAR_MAX)
-#define qu16_MIN ((ushort)0)
-#define qu16_MAX ((ushort)USHRT_MAX)
-#define qu32_MIN ((uint)0)
-#define qu32_MAX ((uint)UINT_MAX)
-
-#define qs8_TYPE char
-#define qs8x1_TYPE char
-#define qs8x2_TYPE char2
-#define qs8x3_TYPE char3
-#define qs8x4_TYPE char4
-#define qs8x8_TYPE char8
-#define qs8x16_TYPE char16
-
-#define qs16_TYPE short
-#define qs16x1_TYPE short
-#define qs16x2_TYPE short2
-#define qs16x3_TYPE short3
-#define qs16x4_TYPE short4
-#define qs16x8_TYPE short8
-#define qs16x16_TYPE short16
-
-#define qs32_TYPE int
-#define qs32x1_TYPE int
-#define qs32x2_TYPE int2
-#define qs32x3_TYPE int3
-#define qs32x4_TYPE int4
-#define qs32x8_TYPE int8
-#define qs32x16_TYPE int16
-
-/* All internal constants are represented in the maximum supported fixed point format (QS16),
- * thus we define an additional shift parameter required to convert the constant
- * from the maximum supported format to the require one.
- */
-#define qs8_SHIFT 8
-#define qs16_SHIFT 0
-
-#undef VEC_DATA_TYPE_STR
-#undef VEC_DATA_TYPE
-#undef CONVERT_STR
-#undef CONVERT
-#undef CONVERT_SAT_STR
-#undef CONVERT_SAT
-
-#define VEC_DATA_TYPE_STR(type, size) type##x##size
-#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
-
-#define CONVERT_STR3(x, type, rtype) (convert_##rtype((x)))
-#define CONVERT_STR2(x, type, rtype) CONVERT_STR3(x, type, rtype)
-#define CONVERT_STR(x, type) CONVERT_STR2(x, type, type##_TYPE)
-#define CONVERT(x, type) CONVERT_STR(x, type)
-
-#define CONVERT_SAT_STR3(x, type, rtype) (convert_##rtype##_sat((x)))
-#define CONVERT_SAT_STR2(x, type, rtype) CONVERT_SAT_STR3(x, type, rtype)
-#define CONVERT_SAT_STR(x, type) CONVERT_SAT_STR2(x, type, type##_TYPE)
-#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
-
-/** Computes saturating absolute value of fixed point vector.
- *
- * @param[in] type the actual data type.
- *
- * @return The result of the fixed point absolute value.
- */
-#define ABSQ_SAT_IMPL(type) \
-  inline type abs_##type##_sat(type VopA) { return CONVERT_SAT(abs(VopA), type); }
-
-ABSQ_SAT_IMPL(qs8x16)
-ABSQ_SAT_IMPL(qs16x8)
-
-#define ABS_SAT_OP_EXPAND_STR(a, type, size) abs_##type##x##size##_sat((a))
-#define ABS_SAT_OP_EXPAND(a, type, size) ABS_SAT_OP_EXPAND_STR(a, type, size)
-
-/** Computes max of fixed point types.
- *
- * @param[in] type the actual data type.
- *
- * @return The result of the fixed point maximum.
- */
-#define MAXQ_IMPL(type) \
-  inline type max_##type(type VopA, type VopB) { return max(VopA, VopB); }
-
-MAXQ_IMPL(qs8x1)
-MAXQ_IMPL(qs8x2)
-MAXQ_IMPL(qs8x4)
-MAXQ_IMPL(qs8x8)
-MAXQ_IMPL(qs8x16)
-MAXQ_IMPL(qs16x1)
-MAXQ_IMPL(qs16x2)
-MAXQ_IMPL(qs16x4)
-MAXQ_IMPL(qs16x8)
-MAXQ_IMPL(qs16x16)
-
-#define MAX_OP_EXPAND_STR(a, b, type, size) max_##type##x##size((a), (b))
-#define MAX_OP_EXPAND(a, b, type, size) MAX_OP_EXPAND_STR(a, b, type, size)
-
-/** Computes saturated addition of fixed point types.
- *
- * @param[in] type the actual data type.
- *
- * @return The result of the fixed point addition. The result is saturated in case of overflow
- */
-#define ADDQ_SAT_IMPL(type) \
-  inline type add_sat_##type(type VopA, type VopB) { return add_sat(VopA, VopB); }
-
-ADDQ_SAT_IMPL(qs8x1)
-ADDQ_SAT_IMPL(qs8x2)
-ADDQ_SAT_IMPL(qs8x4)
-ADDQ_SAT_IMPL(qs8x8)
-ADDQ_SAT_IMPL(qs8x16)
-ADDQ_SAT_IMPL(qs16x1)
-ADDQ_SAT_IMPL(qs16x2)
-ADDQ_SAT_IMPL(qs16x4)
-ADDQ_SAT_IMPL(qs16x8)
-ADDQ_SAT_IMPL(qs16x16)
-ADDQ_SAT_IMPL(qs32x1)
-ADDQ_SAT_IMPL(qs32x2)
-ADDQ_SAT_IMPL(qs32x4)
-ADDQ_SAT_IMPL(qs32x8)
-ADDQ_SAT_IMPL(qs32x16)
-
-#define ADD_SAT_OP_EXPAND_STR(a, b, type, size) add_sat_##type##x##size((a), (b))
-#define ADD_SAT_OP_EXPAND(a, b, type, size) ADD_SAT_OP_EXPAND_STR(a, b, type, size)
-
-/** Computes saturated subtraction of fixed point types.
- *
- * @param[in] type the actual data type.
- *
- * @return The result of the fixed point subtraction. The result is saturated in case of overflow
- */
-#define SUBQ_SAT_IMPL(type) \
-  inline type sub_sat_##type(type VopA, type VopB) { return sub_sat(VopA, VopB); }
-
-SUBQ_SAT_IMPL(qs8x1)
-SUBQ_SAT_IMPL(qs8x2)
-SUBQ_SAT_IMPL(qs8x4)
-SUBQ_SAT_IMPL(qs8x8)
-SUBQ_SAT_IMPL(qs8x16)
-SUBQ_SAT_IMPL(qs16x1)
-SUBQ_SAT_IMPL(qs16x2)
-SUBQ_SAT_IMPL(qs16x4)
-SUBQ_SAT_IMPL(qs16x8)
-SUBQ_SAT_IMPL(qs16x16)
-
-#define SUB_SAT_OP_EXPAND_STR(a, b, type, size) sub_sat_##type##x##size((a), (b))
-#define SUB_SAT_OP_EXPAND(a, b, type, size) SUB_SAT_OP_EXPAND_STR(a, b, type, size)
-
-/* Multiply of two fixed point numbers
- *
- * @param[in] type  the actual data type.
- * @param[in] itype the intermediate data type.
- *
- * @return The result of the fixed point multiplication.
- */
-#define MULQ_IMPL(type, itype)                                               \
-  inline type mul_##type(type VopA, type VopB, int fixed_point_position)     \
-  {                                                                          \
-    itype round_val = (itype)(1 << (fixed_point_position - 1));              \
-    itype res = CONVERT((VopA), itype) * CONVERT((VopB), itype) + round_val; \
-    return CONVERT((res >> (itype)fixed_point_position), type);              \
-  }
-
-MULQ_IMPL(qs8x8, qs16x8)
-MULQ_IMPL(qs16x8, qs32x8)
-MULQ_IMPL(qs8x16, qs16x16)
-MULQ_IMPL(qs16x16, qs32x16)
-
-#define MUL_OP_EXPAND_STR(a, b, type, size, position) mul_##type##x##size((a), (b), (position))
-#define MUL_OP_EXPAND(a, b, type, size, position) MUL_OP_EXPAND_STR(a, b, type, size, position)
-
-/* Saturate multiply of two fixed point numbers
- *
- * @param[in] type  the actual data type.
- * @param[in] itype the intermediate data type.
- *
- * @return The result of the fixed point multiplication. The result is saturated in case of overflow
- */
-#define MULQ_SAT_IMPL(type, itype)                                                  \
-  inline type mul_sat_##type(type VopA, type VopB, int fixed_point_position)        \
-  {                                                                                 \
-    itype round_val = (itype)(1 << (fixed_point_position - 1));                     \
-    itype res = mad_sat(CONVERT((VopA), itype), CONVERT((VopB), itype), round_val); \
-    return CONVERT_SAT((res >> (itype)fixed_point_position), type);                 \
-  }
-
-MULQ_SAT_IMPL(qs8x1, qs16x1)
-MULQ_SAT_IMPL(qs8x2, qs16x2)
-MULQ_SAT_IMPL(qs8x3, qs16x3)
-MULQ_SAT_IMPL(qs8x4, qs16x4)
-MULQ_SAT_IMPL(qs8x8, qs16x8)
-MULQ_SAT_IMPL(qs8x16, qs16x16)
-MULQ_SAT_IMPL(qs16x1, qs32x1)
-MULQ_SAT_IMPL(qs16x2, qs32x2)
-MULQ_SAT_IMPL(qs16x3, qs32x3)
-MULQ_SAT_IMPL(qs16x4, qs32x4)
-MULQ_SAT_IMPL(qs16x8, qs32x8)
-MULQ_SAT_IMPL(qs16x16, qs32x16)
-
-#define MUL_SAT_OP_EXPAND_STR(a, b, type, size, position) \
-  mul_sat_##type##x##size((a), (b), (position))
-#define MUL_SAT_OP_EXPAND(a, b, type, size, position) \
-  MUL_SAT_OP_EXPAND_STR(a, b, type, size, position)
-
-/** Saturate multiply-accumulate
- *
- * @param[in] type  the actual data type.
- * @param[in] itype the intermediate data type.
- *
- * @return The result of the fixed point multiply-accumulate. The result is saturated in case of
- * overflow
- */
-#define MLAQ_SAT_IMPL(type, itype)                                               \
-  type mla_sat_##type(type VopA, type VopB, type VopC, int fixed_point_position) \
-  {                                                                              \
-    itype res = mad_sat(CONVERT(VopB, itype), CONVERT(VopC, itype),              \
-                        (itype)(1 << (fixed_point_position - 1)));               \
-    return add_sat(VopA, CONVERT_SAT(res >> (itype)fixed_point_position, type)); \
-  }
-
-MLAQ_SAT_IMPL(qs8x8, qs16x8)
-MLAQ_SAT_IMPL(qs8x16, qs16x16)
-MLAQ_SAT_IMPL(qs16x8, qs32x8)
-
-#define MLA_SAT_OP_EXPAND_STR(a, b, c, type, size, position) \
-  mla_sat_##type##x##size((a), (b), (c), (position))
-#define MLA_SAT_OP_EXPAND(a, b, c, type, size, position) \
-  MLA_SAT_OP_EXPAND_STR(a, b, c, type, size, position)
-
-/** Saturate multiply-accumulate long
- *
- * @param[in] type  the actual data type.
- * @param[in] itype the intermediate data type.
- *
- * @return The result of the fixed point multiply-accumulate long. The result is saturated in case
- * of overflow
- */
-#define MLALQ_SAT_IMPL(type, itype)                                                 \
-  itype mlal_sat_##type(itype VopA, type VopB, type VopC, int fixed_point_position) \
-  {                                                                                 \
-    itype res = mad_sat(CONVERT(VopB, itype), CONVERT(VopC, itype),                 \
-                        (itype)(1 << (fixed_point_position - 1)));                  \
-    return add_sat(VopA, res >> (itype)fixed_point_position);                       \
-  }
-
-MLALQ_SAT_IMPL(qs8x8, qs16x8)
-MLALQ_SAT_IMPL(qs16x8, qs32x8)
-
-#define MLAL_SAT_OP_EXPAND_STR(a, b, c, type, size, position) \
-  mlal_sat_##type##x##size((a), (b), (c), (position))
-#define MLAL_SAT_OP_EXPAND(a, b, c, type, size, position) \
-  MLAL_SAT_OP_EXPAND_STR(a, b, c, type, size, position)
-
-/** Saturate division of two fixed point vectors
- *
- * @param[in] stype the actual scalar data type.
- * @param[in] type  the actual data type.
- * @param[in] itype the intermediate data type.
- *
- * @return The result of the fixed point division. The result is saturated in case of overflow
- */
-#define DIVQ_SAT_IMPL(stype, type, itype)                                                          \
-  inline type div_sat_##type(type VopA, type VopB, int fixed_point_position)                       \
-  {                                                                                                \
-    itype conv_a = CONVERT((VopA), itype);                                                         \
-    itype denominator = CONVERT((VopB), itype);                                                    \
-    itype numerator = conv_a << (itype)(fixed_point_position);                                     \
-    itype res = select((itype)(numerator / denominator),                                           \
-                       select((itype)stype##_MAX, (itype)stype##_MIN, (itype)(conv_a < (itype)0)), \
-                       (itype)(denominator == (itype)0));                                          \
-    return CONVERT_SAT((res), type);                                                               \
-  }
-
-DIVQ_SAT_IMPL(qs8, qs8x16, qs16x16)
-DIVQ_SAT_IMPL(qs16, qs16x8, qs32x8)
-DIVQ_SAT_IMPL(qs16, qs16x16, qs32x16)
-DIVQ_SAT_IMPL(qs8, qs8, qs16)
-DIVQ_SAT_IMPL(qs16, qs16, qs32)
-
-#define DIV_SAT_OP_EXPAND_STR(a, b, type, position) div_sat_##type((a), (b), (position))
-#define DIV_SAT_OP_EXPAND(a, b, type, position) DIV_SAT_OP_EXPAND_STR(a, b, type, position)
-
-#define DIV_SAT_OP_VEC_EXPAND_STR(a, b, type, size, position) \
-  div_sat_##type##x##size((a), (b), (position))
-#define DIV_SAT_OP_VEC_EXPAND(a, b, type, size, position) \
-  DIV_SAT_OP_VEC_EXPAND_STR(a, b, type, size, position)
-
-/** Saturate exponential of a fixed point vector
- *
- * @note Implemented approach uses taylor polynomial to approximate the exponential function.
- *
- * @param[in] stype the actual scalar data type.
- * @param[in] type  the actual data type.
- * @param[in] size  the number of the calculated elements.
- *
- * @return The result of the fixed point exponential. The result is saturated in case of overflow
- */
-#define EXPQ_IMPL(stype, type, size)                                                            \
-  inline type exp_sat_##type(type VopA, int fixed_point_position)                               \
-  {                                                                                             \
-    type const_one = (type)(1 << (fixed_point_position));                                       \
-    type ln2 = (type)((((0x58B9 >> (14 - fixed_point_position))) + 1) >> 1);                    \
-    type inv_ln2 = (type)((((0x38AA >> (14 - fixed_point_position)) + 1) >> 1)) | const_one;    \
-    type A = (type)(((0x7FBA >> (14 - fixed_point_position)) + 1) >> 1);                        \
-    type B = (type)(((0x3FE9 >> (14 - fixed_point_position)) + 1) >> 1);                        \
-    type C = (type)(((0x1693 >> (14 - fixed_point_position)) + 1) >> 1);                        \
-    type D = (type)(((0x0592 >> (14 - fixed_point_position)) + 1) >> 1);                        \
-    type m = MUL_SAT_OP_EXPAND(VopA, inv_ln2, stype, size, fixed_point_position);               \
-    type dec_m = m >> (type)fixed_point_position;                                               \
-    type alpha = MUL_SAT_OP_EXPAND(dec_m << (type)fixed_point_position, ln2, stype, size,       \
-                                   fixed_point_position);                                       \
-    alpha = CONVERT(abs_diff(VopA, alpha), type);                                               \
-    type sum = add_sat(MUL_SAT_OP_EXPAND(alpha, D, stype, size, fixed_point_position), C);      \
-    sum = add_sat(MUL_SAT_OP_EXPAND(alpha, sum, stype, size, fixed_point_position), B);         \
-    sum = add_sat(MUL_SAT_OP_EXPAND(alpha, sum, stype, size, fixed_point_position), A);         \
-    sum = add_sat(MUL_SAT_OP_EXPAND(alpha, sum, stype, size, fixed_point_position), const_one); \
-    return select((type)stype##_MAX, select(sum << dec_m, sum >> -dec_m, dec_m < (type)0),      \
-                  clz(sum) > dec_m); /* Saturate result if needed */                            \
-  }
-
-EXPQ_IMPL(qs8, qs8x2, 2)
-EXPQ_IMPL(qs8, qs8x4, 4)
-EXPQ_IMPL(qs8, qs8x8, 8)
-EXPQ_IMPL(qs8, qs8x16, 16)
-EXPQ_IMPL(qs16, qs16x2, 2)
-EXPQ_IMPL(qs16, qs16x4, 4)
-EXPQ_IMPL(qs16, qs16x8, 8)
-EXPQ_IMPL(qs16, qs16x16, 16)
-
-#define EXP_OP_EXPAND_STR(a, type, size, position) exp_sat_##type##x##size((a), (position))
-#define EXP_OP_EXPAND(a, type, size, position) EXP_OP_EXPAND_STR(a, type, size, position)
-
-/** Saturate logarithm of a fixed point vector
- *
- * @note Implemented approach uses taylor polynomial to approximate the logarithm function.
- *
- * @param[in] stype the actual scalar data type.
- * @param[in] type  the actual data type.
- * @param[in] size  the number of the calculated elements.
- *
- * @return The result of the fixed point logarithm. The result is saturated in case of overflow
- */
-#define LOGQ_IMPL(stype, type, size)                                                            \
-  inline type log_sat_##type(type VopA, int fixed_point_position)                               \
-  {                                                                                             \
-    type const_one = (type)(1 << (fixed_point_position));                                       \
-    type ln2 = (type)(0x58B9 >> (15 - fixed_point_position)); /* 1.4384189 */                   \
-    type A = (type)(0x5C0F >> (14 - fixed_point_position));   /* 1.4384189 */                   \
-    type B = -(type)(0x56AE >> (15 - fixed_point_position));  /* -0.6771900 */                  \
-    type C = (type)(0x2933 >> (15 - fixed_point_position));   /* 0.3218538 */                   \
-    type D = -(type)(0x0AA7 >> (15 - fixed_point_position));  /* -0.0832229 */                  \
-    type inter_a =                                                                              \
-        select(VopA, DIV_SAT_OP_VEC_EXPAND(const_one, VopA, stype, size, fixed_point_position), \
-               VopA < const_one);                                                               \
-    type shift_val = (type)(15 - stype##_SHIFT) - clz(inter_a >> (type)fixed_point_position);   \
-    inter_a = inter_a >> shift_val;                                                             \
-    inter_a = sub_sat(inter_a, const_one);                                                      \
-    type sum = add_sat(MUL_SAT_OP_EXPAND(inter_a, D, stype, size, fixed_point_position), C);    \
-    sum = add_sat(MUL_SAT_OP_EXPAND(inter_a, sum, stype, size, fixed_point_position), B);       \
-    sum = add_sat(MUL_SAT_OP_EXPAND(inter_a, sum, stype, size, fixed_point_position), A);       \
-    sum = MUL_SAT_OP_EXPAND(inter_a, sum, stype, size, fixed_point_position);                   \
-    sum = MUL_SAT_OP_EXPAND(add_sat(sum, shift_val << (type)fixed_point_position), ln2, stype,  \
-                            size, fixed_point_position);                                        \
-    return select(select(sum, -sum, VopA < const_one), (type)0,                                 \
-                  VopA < (type)0); /* Saturate result if needed */                              \
-  }
-
-LOGQ_IMPL(qs8, qs8x16, 16)
-LOGQ_IMPL(qs16, qs16x8, 8)
-LOGQ_IMPL(qs16, qs16x16, 16)
-
-#define LOG_OP_EXPAND_STR(a, type, size, position) log_sat_##type##x##size((a), (position))
-#define LOG_OP_EXPAND(a, type, size, position) LOG_OP_EXPAND_STR(a, type, size, position)
-
-/** Saturate inverse square root of a fixed point vector
- *
- * @note Implemented approach uses Newton's method to approximate the inverse square root function.
- *
- * @param[in] stype the actual scalar data type.
- * @param[in] type  the actual data type.
- * @param[in] size  the number of the calculated elements.
- *
- * @return The result of the fixed point inverse square root. The result is saturated in case of
- * overflow
- */
-#define INVSQRTQ_IMPL(stype, type, size)                                                           \
-  inline type invsqrt_sat_##type(type VopA, int fixed_point_position)                              \
-  {                                                                                                \
-    type const_three = (type)(3 << (fixed_point_position));                                        \
-    type shift_value = (type)(16 - stype##_SHIFT) - (clz(VopA) + (type)fixed_point_position);      \
-    type temp = select((type)(VopA >> shift_value),                                                \
-                       select((type)stype##_MAX, (type)(VopA << (-shift_value)),                   \
-                              (type)(clz(VopA) > (-shift_value))),                                 \
-                       (type)(shift_value < (type)0));                                             \
-    type x = temp;                                                                                 \
-    x = MUL_SAT_OP_EXPAND(                                                                         \
-            x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size,         \
-                                                                        fixed_point_position),     \
-                                                      temp, stype, size, fixed_point_position)),   \
-            stype, size, fixed_point_position) >>                                                  \
-        1;                                                                                         \
-    x = MUL_SAT_OP_EXPAND(                                                                         \
-            x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size,         \
-                                                                        fixed_point_position),     \
-                                                      temp, stype, size, fixed_point_position)),   \
-            stype, size, fixed_point_position) >>                                                  \
-        1;                                                                                         \
-    x = MUL_SAT_OP_EXPAND(                                                                         \
-            x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size,         \
-                                                                        fixed_point_position),     \
-                                                      temp, stype, size, fixed_point_position)),   \
-            stype, size, fixed_point_position) >>                                                  \
-        1;                                                                                         \
-    if (sizeof((stype)(1)) > 1) /* Perform more iterations if datatype is QS16 */                  \
-    {                                                                                              \
-      x = MUL_SAT_OP_EXPAND(                                                                       \
-              x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size,       \
-                                                                          fixed_point_position),   \
-                                                        temp, stype, size, fixed_point_position)), \
-              stype, size, fixed_point_position) >>                                                \
-          1;                                                                                       \
-      x = MUL_SAT_OP_EXPAND(                                                                       \
-              x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size,       \
-                                                                          fixed_point_position),   \
-                                                        temp, stype, size, fixed_point_position)), \
-              stype, size, fixed_point_position) >>                                                \
-          1;                                                                                       \
-    }                                                                                              \
-    type shift_value2 = select(shift_value >> 1, (-shift_value) >> 1, shift_value < (type)0);      \
-    return select((type)(x >> shift_value2), select((type)stype##_MAX, (type)(x << shift_value2),  \
-                                                    (type)(clz(x) > shift_value2)),                \
-                  (type)(shift_value < (type)0)); /* Saturate result if needed */                  \
-  }
-
-INVSQRTQ_IMPL(qs8, qs8x1, 1)
-INVSQRTQ_IMPL(qs16, qs16x1, 1)
-INVSQRTQ_IMPL(qs8, qs8x16, 16)
-INVSQRTQ_IMPL(qs16, qs16x8, 8)
-
-#define INVSQRT_OP_EXPAND_STR(a, type, size, position) invsqrt_sat_##type##x##size((a), (position))
-#define INVSQRT_OP_EXPAND(a, type, size, position) INVSQRT_OP_EXPAND_STR(a, type, size, position)
-
-/** Saturate hyperbolic tangent of a fixed point vector
- *
- * tanh(x) = (e^2x - 1)/(e^2x + 1)
- *
- * @param[in] stype the actual scalar data type.
- * @param[in] type  the actual data type.
- * @param[in] size  the number of the calculated elements.
- *
- * @return The result of the fixed point hyperbolic tangent. The result is saturated in case of
- * overflow
- */
-#define TANHQ_IMPL(stype, type, size)                                                        \
-  inline type tanh_sat_##type(type VopA, int fixed_point_position)                           \
-  {                                                                                          \
-    type const_one = (type)(1 << (fixed_point_position));                                    \
-    type const_two = (type)(2 << (fixed_point_position));                                    \
-    type exp2x =                                                                             \
-        EXP_OP_EXPAND(MUL_SAT_OP_EXPAND(const_two, VopA, stype, size, fixed_point_position), \
-                      stype, size, fixed_point_position);                                    \
-    type num = SUB_SAT_OP_EXPAND(exp2x, const_one, stype, size);                             \
-    type den = ADD_SAT_OP_EXPAND(exp2x, const_one, stype, size);                             \
-    return DIV_SAT_OP_VEC_EXPAND(num, den, stype, size, fixed_point_position);               \
-  }
-
-TANHQ_IMPL(qs8, qs8x16, 16)
-TANHQ_IMPL(qs16, qs16x8, 8)
-
-#define TANH_OP_EXPAND_STR(a, type, size, position) tanh_sat_##type##x##size((a), (position))
-#define TANH_OP_EXPAND(a, type, size, position) TANH_OP_EXPAND_STR(a, type, size, position)
-
-#define floatx16 float16
-#define float16_TYPE float16
-
-#define CONVERTQ_DOWN_IMPL(in_type, out_type)                                           \
-  inline out_type convert_##out_type##_##in_type(in_type a, int fixed_point_position)   \
-  {                                                                                     \
-    return CONVERT(a * (1 << fixed_point_position) +                                    \
-                       select((in_type)-0.5f, (in_type)0.5f, isgreater(a, (in_type)0)), \
-                   out_type);                                                           \
-  }
-
-CONVERTQ_DOWN_IMPL(float16, qs8x16)
-CONVERTQ_DOWN_IMPL(float16, qs16x16)
-
-#define CONVERTQ_DOWN_SAT_IMPL(in_type, out_type)                                           \
-  inline out_type convert_##out_type##_##in_type##_sat(in_type a, int fixed_point_position) \
-  {                                                                                         \
-    return CONVERT_SAT(a * (1 << fixed_point_position) +                                    \
-                           select((in_type)-0.5f, (in_type)0.5f, isgreater(a, (in_type)0)), \
-                       out_type);                                                           \
-  }
-
-CONVERTQ_DOWN_SAT_IMPL(float16, qs8x16)
-CONVERTQ_DOWN_SAT_IMPL(float16, qs16x16)
-
-#define CONVERTQ_UP_IMPL(in_type, out_type)                                           \
-  inline out_type convert_##out_type##_##in_type(in_type a, int fixed_point_position) \
-  {                                                                                   \
-    return CONVERT(a, out_type) / (1 << fixed_point_position);                        \
-  }
-
-CONVERTQ_UP_IMPL(qs8x16, float16)
-CONVERTQ_UP_IMPL(qs16x16, float16)
-
-#define SQCVT_SAT_IMPL(type)                                                                \
-  inline type sqcvt_##type##_sat(float a, int fixed_point_position)                         \
-  {                                                                                         \
-    return CONVERT_SAT((a * (1 << fixed_point_position) + ((a < 0) ? -0.5f : 0.5f)), type); \
-  }
-
-SQCVT_SAT_IMPL(qs8)
-SQCVT_SAT_IMPL(qs16)
-
-#define SQCVT_SAT_OP_EXPAND_STR(a, type, position) sqcvt_##type##_sat((a), (position))
-#define SQCVT_SAT_OP_EXPAND(a, type, position) SQCVT_SAT_OP_EXPAND_STR((a), type, position)
-
-#endif // ARM_COMPUTE_FIXED_POINT_H
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl
index 25e20f5f2..6b767d6c9 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl
@@ -2,25 +2,17 @@
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
  * Copyright (c) 2017 ARM Limited.
  *
- * SPDX-License-Identifier: MIT
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
  *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
+ *      http://www.apache.org/licenses/LICENSE-2.0
  *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 #include "helpers.h"
 
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl
new file mode 100644
index 000000000..ed7409852
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
+/** Perform hashtable_lookup of input tensor
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16
+ * @attention Number of input dimensions are passed as a preprocessor argument using -DNUM_DIMS=size, e.g. -DNUM_DIMS=4
+ *
+ * @param[in]  input_ptr                             Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[in]  input_stride_w                        Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                          output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                         output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  lookups_ptr                           Pointer to the lookups vector. Supported data types: S32
+ * @param[in]  lookups_stride_x                      Stride of the lookups vector in X dimension (in bytes)
+ * @param[in]  lookups_step_x                        lookups_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector
+ */
+__kernel void hashtable_lookup(TENSOR4D_DECLARATION(input),
+                               TENSOR4D_DECLARATION(output),
+                               VECTOR_DECLARATION(lookups))
+{
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+    Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT);
+
+    Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups);
+
+    int lup_id[4] = {0};
+
+    lup_id[0] = (NUM_DIMS == 1)?*((__global int *)vector_offset(&lups,get_global_id(0)))
+	                       :get_global_id(0);
+    lup_id[1] = (NUM_DIMS == 2)?*((__global int *)vector_offset(&lups,get_global_id(1)))
+	                       :get_global_id(1);
+    lup_id[2] = (NUM_DIMS == 3)?*((__global int *)vector_offset(&lups,get_global_id(2)))
+	                       :get_global_id(2)%DEPTH_OUT;
+    lup_id[3] = (NUM_DIMS == 4)?*((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT))
+                               :get_global_id(2) / DEPTH_OUT;
+
+    if (lup_id[NUM_DIMS-1] < 0)
+    {
+      VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, 0, (__global DATA_TYPE *)out.ptr);
+      return;
+    }
+
+    in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + lup_id[1] * input_step_y
+              + lup_id[2] * input_step_z + lup_id[3] * input_step_w;
+
+    VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
+                     0, (__global DATA_TYPE *)out.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
index 8143d2398..0e123ae0a 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
@@ -24,15 +24,23 @@
 #ifndef ARM_COMPUTE_HELPER_H
 #define ARM_COMPUTE_HELPER_H
 
-#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
 
-#if defined(ARM_COMPUTE_DEBUG_ENABLED)
-#if defined(cl_arm_printf)
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && \
+    defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) &&
+       // defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
 #pragma OPENCL EXTENSION cl_arm_printf : enable
-#endif // defined(cl_arm_printf)
-#endif // defined(ARM_COMPUTE_DEBUG_ENABLED)
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
 
 #define EXPAND(x) x
 
@@ -175,7 +183,7 @@ typedef struct Tensor4D
  *
  * @return An image object
  */
-Vector inline update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
                                          uint stride_x, uint step_x)
 {
   Vector vector = {
@@ -201,7 +209,7 @@ Vector inline update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_
  *
  * @return An image object
  */
-Image inline update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
                                        uint stride_x, uint step_x, uint stride_y, uint step_y)
 {
   Image img = {.ptr = ptr,
@@ -230,7 +238,7 @@ Image inline update_image_workitem_ptr(__global uchar *ptr, uint offset_first_el
  *
  * @return A 3D tensor object
  */
-Image inline update_image_from_tensor3D_workitem_ptr(__global uchar *ptr,
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr,
                                                      uint offset_first_element_in_bytes,
                                                      uint stride_x, uint step_x, uint stride_y,
                                                      uint step_y, uint stride_z, uint step_z)
@@ -261,7 +269,7 @@ Image inline update_image_from_tensor3D_workitem_ptr(__global uchar *ptr,
  *
  * @return A 3D tensor object
  */
-Tensor3D inline update_tensor3D_workitem_ptr(__global uchar *ptr,
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr,
                                              uint offset_first_element_in_bytes, uint stride_x,
                                              uint step_x, uint stride_y, uint step_y, uint stride_z,
                                              uint step_z)
@@ -276,7 +284,7 @@ Tensor3D inline update_tensor3D_workitem_ptr(__global uchar *ptr,
   return tensor;
 }
 
-Tensor4D inline update_tensor4D_workitem_ptr(__global uchar *ptr,
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr,
                                              uint offset_first_element_in_bytes, uint stride_x,
                                              uint step_x, uint stride_y, uint step_y, uint stride_z,
                                              uint step_z, uint stride_w, uint step_w, uint mod_size)
@@ -299,7 +307,7 @@ Tensor4D inline update_tensor4D_workitem_ptr(__global uchar *ptr,
  * @param[in] vec Pointer to the starting position of the buffer
  * @param[in] x   Relative X position
  */
-__global inline const uchar *vector_offset(const Vector *vec, int x)
+inline __global const uchar *vector_offset(const Vector *vec, int x)
 {
   return vec->ptr + x * vec->stride_x;
 }
@@ -310,7 +318,7 @@ __global inline const uchar *vector_offset(const Vector *vec, int x)
  * @param[in] x   Relative X position
  * @param[in] y   Relative Y position
  */
-__global inline uchar *offset(const Image *img, int x, int y)
+inline __global uchar *offset(const Image *img, int x, int y)
 {
   return img->ptr + x * img->stride_x + y * img->stride_y;
 }
@@ -322,7 +330,7 @@ __global inline uchar *offset(const Image *img, int x, int y)
  * @param[in] y      Relative Y position
  * @param[in] z      Relative Z position
  */
-__global inline const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
 {
   return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
 }
@@ -335,7 +343,7 @@ __global inline const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int
  * @param[in] z      Relative Z position
  * @param[in] w      Relative W position
  */
-__global inline const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
 {
   return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z +
          w * tensor->stride_w;
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl
new file mode 100644
index 000000000..e3aa463db
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(DATA_TYPE)
+/** Performs a negation of input tensor.
+ *
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ *
+ * @param[in]  in_ptr                            Pointer to the source image. Supported data types: S16/S32/F16/F32.
+ * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  Offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  out_offset_first_element_in_bytes Offset of the first element in the destination image
+ */
+__kernel void neg_tensor(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    VSTORE(VEC_SIZE)
+    (-VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), 0, (__global DATA_TYPE *)output.ptr);
+}
+#endif // defined(DATA_TYPE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl
new file mode 100644
index 000000000..ecf4696e9
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(IW) && defined(IH) && defined(ID) && defined(IB) && defined(DEPTH_OUT) && defined(ZERO_VALUE)
+/** Perform space to depth rearrangement of tensor
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
+ * @attention Input dimensions should be passed as a preprocessor argument using -DIW(width), -DIH(height), -DID(depth) and -DIB(batch). e.g. -DIW = 4
+ * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in  bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in  bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in  bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ *
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p inpu
+t_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in
+bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ *
+ * @param[in]  pad_values                           Padding values for each of the dimensions. Only pad values for Up(for
+ *                                                  batch), Top(for height), Left(for width) and Front(for depth) are
+ *				                    required. Supported data type: S32
+ */
+
+__kernel void pad(
+     TENSOR4D_DECLARATION(input),
+     TENSOR4D_DECLARATION(output),
+     const int4 pad_values)
+ {
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+    int index[4]={0};
+
+    index[0] = get_global_id(0);//W
+    index[1] = get_global_id(1);//H
+    index[2] = get_global_id(2) % DEPTH_OUT;//C
+    index[3] = get_global_id(2) / DEPTH_OUT;//N
+
+    if (index[0] < pad_values.x || index[0] >= (IW + pad_values.x) ||
+        index[1] < pad_values.y || index[1] >= (IH + pad_values.y) ||
+        index[2] < pad_values.z || index[2] >= (ID + pad_values.z) ||
+        index[3] < pad_values.w || index[3] >= (IB + pad_values.w))
+    {
+        *((__global DATA_TYPE *)out.ptr) = (DATA_TYPE)ZERO_VALUE;
+    }
+    else
+    {
+        *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)
+                                              tensor4D_offset(&in, index[0] - pad_values.x,
+                                                              index[1] - pad_values.y,
+                                                              index[2] - pad_values.z,
+                                                              index[3] - pad_values.w));
+    }
+ }
+
+#endif //if defined(IW) && defined(IH) && defined(ID) && defined(IB) && defined(DEPTH_OUT) && defined(ZERO_VALUE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl
new file mode 100644
index 000000000..7cc8b0354
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4)
+/** Perform a Generic permute operation on an input tensor of Shape DCHW.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
+ * @attention Permutation vector is passed as a preprocessor arguement using -DP1, -DP2, -DP3 and -DP4=int, e.g. -DP1=2
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/QASYMM8/U1
+6/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in b
+ytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in b
+ytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in b
+ytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p inpu
+t_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in
+bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void permute_generic(
+    TENSOR4D_DECLARATION(input),
+    TENSOR4D_DECLARATION(output))
+{
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+
+   int out_index[4];
+   int in_index[4];
+   in_index[0] = get_global_id(0);//W
+   in_index[1] = get_global_id(1);//H
+   in_index[2] = get_global_id(2) % DEPTH_IN;//C
+   in_index[3] = get_global_id(2) / DEPTH_IN;//B
+   out_index[0] = in_index[P1];
+   out_index[1] = in_index[P2];
+   out_index[2] = in_index[P3];
+   out_index[3] = in_index[P4];
+
+    *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0],out_index[1],out_index[2],out_index[3])) = *((__global DATA_TYPE *)in.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl
index 512c62023..aa05121b1 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl
@@ -2,25 +2,17 @@
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
  * Copyright (c) 2016, 2017 ARM Limited.
  *
- * SPDX-License-Identifier: MIT
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
  *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
+ *      http://www.apache.org/licenses/LICENSE-2.0
  *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 #include "helpers.h"
 
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl
index 82edf3b1d..fdfb78003 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl
@@ -2,40 +2,20 @@
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
  * Copyright (c) 2016, 2017 ARM Limited.
  *
- * SPDX-License-Identifier: MIT
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
  *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
+ *      http://www.apache.org/licenses/LICENSE-2.0
  *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 #include "helpers.h"
 
-#if defined(FIXED_POINT_POSITION)
-
-#include "fixed_point.h"
-
-#if defined(SATURATE)
-#define DIV_OP(x, y, scale, type, size) DIV_SAT_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION)
-#else // SATURATE
-#define DIV_OP(x, y, scale, type, size) DIV_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION)
-#endif // SATURATE
-
-#else // FIXED_POINT_POSITION
-
 #if defined(SATURATE)
 #define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size##_sat(x))
 #else // SATURATE
@@ -45,17 +25,14 @@
 
 #define DIV_OP(x, y, scale, type, size) CONVERT_OP_INT((x) / (y) >> scale, type, size)
 
-#endif // FIXED_POINT_POSITION
-
 /** Performs a pixelwise division with integer scale of integer inputs.
  *
  * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
  * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short
  * @attention The data_type of the intermediate result of the division should passed as well using -DDATA_TYPE_RES.
  * e.g. If one of inputs is S16 -DDATA_TYPE_RES=int should be passed else -DDATA_TYPE_RES=short.
- * @note In case of fixed-point operation -DFIXED_POINT_POSITION=fixed_point_position must be provided: e.g. -DFIXED_POINT_POSITION=3
  *
- * @param[in]  in1_ptr                           Pointer to the source image. Supported data types: U8/QS8/QS16/S16
+ * @param[in]  in1_ptr                           Pointer to the source image. Supported data types: U8/S16
  * @param[in]  in1_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  in1_stride_y                      Stride of the source image in Y dimension (in bytes)
@@ -79,7 +56,7 @@
  * @param[in]  out_stride_z                      Stride of the destination image in Y dimension (in bytes)
  * @param[in]  out_step_z                        out_stride_z * number of elements along Y processed per workitem(in bytes)
  * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in]  scale                             Integer scaling factor. Supported data types: S32 (ignored for QS8 and QS16 as the assumption is scale = 1).
+ * @param[in]  scale                             Integer scaling factor. Supported data types: S32
  */
 __kernel void pixelwise_div_int(
     TENSOR3D_DECLARATION(in1),
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
index ddc9d5a27..ab1307e64 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
@@ -2,25 +2,17 @@
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
  * Copyright (c) 2016, 2017 ARM Limited.
  *
- * SPDX-License-Identifier: MIT
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
  *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
+ *      http://www.apache.org/licenses/LICENSE-2.0
  *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 #include "helpers_asymm.h"
 
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl
new file mode 100644
index 000000000..68da2ba32
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(DATA_TYPE)
+/** Returns result of prelu function implemented as below:
+ * f(input) = alpha * input for input < 0, f(input) = input for input >= 0.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Can only take floating point data types.
+ *
+ * @param[in]  input1_ptr                            Pointer to the source image. Supported Data types : F16/F32
+ * @param[in]  input1_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input1_step_x                         input1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input1_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input1_step_y                         input1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input1_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input1_step_z                         input1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input1_offset_first_element_in_bytes  The offset of the first element in the source image
+ *
+ * @param[in]  alpha_ptr                            Pointer to the source image. Supported Data types : F16/F32
+ * @param[in]  alpha_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  alpha_step_x                         input2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  alpha_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  alpha_step_y                         input2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  alpha_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  alpha_step_z                         input2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  alpha_offset_first_element_in_bytes  The offset of the first element in the source image
+ *
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void prelu(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(alpha),
+    TENSOR3D_DECLARATION(output))
+{
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D alpha  = CONVERT_TO_TENSOR3D_STRUCT(alpha);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    VSTORE(VEC_SIZE)
+      (VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) < 0 ?
+                       VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) * VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)alpha.ptr) :
+	               VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr),
+       0, (__global DATA_TYPE *)output.ptr);
+
+}
+#endif // defined(DATA_TYPE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl
new file mode 100644
index 000000000..7e97b7ed6
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+#define SUB(x, y) (x) - (y)
+
+#if defined(OFF_IN1) && defined(OFF_IN2) && defined(OFF_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(VEC_SIZE)
+
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE)
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+
+/** Returns result of prelu function implemented as below:
+ * f(input) = alpha * input for input < 0, f(input) = input for input >= 0.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE_IN compile flag, e.g. -DDATA_TYPE_IN=uchar
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Can only take uchar data types.
+ *
+ * @param[in]  input1_ptr                            Pointer to the source image. Supported Data types : QASYMM8
+ * @param[in]  input1_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input1_step_x                         input1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input1_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input1_step_y                         input1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input1_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input1_step_z                         input1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input1_offset_first_element_in_bytes  The offset of the first element in the source image
+ *
+ * @param[in]  alpha_ptr                            Pointer to the source image. Supported Data types : QASYMM8
+ * @param[in]  alpha_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  alpha_step_x                         input2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  alpha_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  alpha_step_y                         input2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  alpha_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  alpha_step_z                         input2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  alpha_offset_first_element_in_bytes  The offset of the first element in the source image
+ *
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void prelu_qasymm8(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(alpha),
+    TENSOR3D_DECLARATION(output))
+{
+    // Get pixels pointer
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)input.ptr), VEC_INT);
+    VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)alpha.ptr), VEC_INT);
+
+    in_a = SUB(in_a, (VEC_INT)((int)OFF_IN1));
+    in_b = SUB(in_b, (VEC_INT)((int)OFF_IN2));
+
+    const VEC_FLOAT in1f32  = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1);
+    const VEC_FLOAT in2f32  = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2);
+    const VEC_FLOAT outf32 = in1f32 < 0 ? in1f32 * in2f32 : in1f32;
+    const VEC_FLOAT qresf32 = outf32 / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFF_OUT));
+    const VEC_UCHAR res     = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_UCHAR);
+
+    VSTORE(VEC_SIZE)
+    (res, 0, (__global uchar *)output.ptr);
+}
+
+#endif // defined(OFF_IN1) && defined(OFF_IN2) && defined(OFF_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(VEC_SIZE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl
deleted file mode 100644
index dfa3b85f4..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(WIDTH)
-/** Perform reduce max
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- *
- * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types:  F16/F32
- * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[out] output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[out] output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[out] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void reduce_max(VECTOR_DECLARATION(input),
-                         VECTOR_DECLARATION(output))
-{
-    Vector input = CONVERT_TO_VECTOR_STRUCT(input);
-    Vector output = CONVERT_TO_VECTOR_STRUCT(output);
-
-    __global float *input_addr = (__global float *)(input.ptr);
-    __global float *output_addr = (__global float *)(output.ptr);
-
-    float max_value = *input_addr;
-    for(int x = 1; x < WIDTH; x++)
-    {
-        float value = *(input_addr + x);
-        max_value = max(value, max_value);
-    }
-
-    // Store max
-    *output_addr = max_value;
-}
-#endif // defined(WIDTH)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl
new file mode 100644
index 000000000..8bef49363
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
+/** Perform reduce max/min
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
+ * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using
+ * -DOP_CODE = number. e.g. -DOP_CODE=1
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                         output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  axis                                 Axis through which reduction occurs
+ * @param[in]  dim                                  Dimension across the axis to be reduced.
+ */
+__kernel void reduce_min_max(TENSOR4D_DECLARATION(input),
+                             TENSOR4D_DECLARATION(output),
+                             const int axis,
+                             const int dim)
+{
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+    int indices[4] =
+    {
+        get_global_id(0),
+        get_global_id(1),
+        get_global_id(2) % DEPTH_OUT,
+        get_global_id(2) / DEPTH_OUT,
+    };
+
+    DATA_TYPE value = *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
+    for(int i = 1; i < dim; ++i)
+    {
+      indices[axis] = i;
+
+      #if OP_CODE == 1 // REDUCE_MAX
+       value = max(value, *((__global DATA_TYPE *)
+			   tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])));
+
+      #elif OP_CODE == 2 // REDUCE_MIN
+       value = min(value, *((__global DATA_TYPE *)
+			   tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])));
+
+      #else // OP NOT SUPPORTED
+       return;
+
+      #endif
+    }
+
+    *((__global DATA_TYPE *)out.ptr) = value;
+}
+
+/** Perform reduce sum/mean
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
+ * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using
+ * -DOP_CODE = number. e.g. -DOP_CODE=1
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                         output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  axis                                 Axis through which reduction occurs
+ * @param[in]  dim                                  Dimension across the axis to be reduced.
+ */
+__kernel void reduce_sum_mean(TENSOR4D_DECLARATION(input),
+                              TENSOR4D_DECLARATION(output),
+                              const int axis,
+                              const int dim)
+{
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+    int indices[4] =
+    {
+        get_global_id(0),
+        get_global_id(1),
+        get_global_id(2) % DEPTH_OUT,
+        get_global_id(2) / DEPTH_OUT,
+    };
+
+    DATA_TYPE sum_value = (DATA_TYPE)0;
+    for(int i = 0; i < dim; ++i)
+    {
+      indices[axis] = i;
+      sum_value += *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
+    }
+
+    #if OP_CODE == 3 // REDUCE_SUM
+     *((__global DATA_TYPE *)out.ptr) = sum_value;
+
+    #elif OP_CODE == 4 // REDUCE_MEAN
+     *((__global DATA_TYPE *)out.ptr) = sum_value / CONVERT(dim, DATA_TYPE);
+
+    #else // OP NOT SUPPORTED
+     return;
+
+    #endif
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduction_mean.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduction_mean.cl
deleted file mode 100644
index 1a96eea61..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduction_mean.cl
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-inline DATA_TYPE sum_8(__global const DATA_TYPE *input)
-{
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    in = vload8(0, input);
-    in.s0123 += in.s4567;
-    in.s01 += in.s23;
-    return ((in.s0 + in.s1));
-}
-
-/** This function calculates the sum and sum of squares of a given input image.
- *
- * @note To enable calculation sum of squares -DSTDDEV should be passed as a preprocessor argument.
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] local_sum                         Local sum of all elements
- * @param[in]  height                            Height of the input image
- * @param[in]  divider                           Divider to calculate mean
- */
-__kernel void reduction_mean(
-    IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(dst),
-    __local DATA_TYPE *local_sums, 
-    int height,
-    int divider)
-{
-    // Get pixels pointer
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    float8 tmp_sum = 0;
-    // Calculate partial sum
-
-    for(int i = 0; i < height; i++)
-    {
-        local_sums[0] += sum_8((__global DATA_TYPE *)offset(&src, 0, i));
-    }
-    ((__global DATA_TYPE *)offset(&dst, get_global_id(0), get_global_id(1)))[0] = local_sums[0]/divider;
-}
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl
new file mode 100644
index 000000000..a0fc2d5a9
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE)
+/** Perform space to batch with input of 4D and NCHW format
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
+ * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size. e.g. -DBATCH_IN=16
+ * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DHEIGHT_IN=16
+ * @attention Input tensor width should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DWIDTH_IN=16
+ * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0
+ *
+ * @param[in]  input_ptr                                   Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                              Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                                input_stride_x * number of elements along X processed per workitem(in  bytes)
+ * @param[in]  input_stride_y                              Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                                input_stride_y * number of elements along Y processed per workitem(in  bytes)
+ * @param[in]  input_stride_z                              Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                                input_stride_z * number of elements along Z processed per workitem(in  bytes)
+ * @param[in]  input_stride_w                              Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                                input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes         The offset of the first element in the source tensor
+ * @param[out] output_ptr                                  Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                             Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                               output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                             Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                               output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                             Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                               output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                             Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                               output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes        The offset of the first element in the destination tensor
+ * @param[in]  block_size_ptr                              Pointer to the source tensor. Supported data types: S32
+ * @param[in]  block_size_stride_x                         Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  block_size_step_x                           block_size_stride_x * number of elements along X processed per workitem(in  bytes)
+ * @param[in]  block_size_offset_first_element_in_bytes    The offset of the first element in the destination tensor
+ * @param[in]  padding_size_ptr                            Pointer to the source tensor. Supported data types: S32
+ * @param[in]  padding_size_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  padding_size_step_x                         padding_size_stride_x * number of elements along X processed per workitem(in  bytes)
+ * @param[in]  padding_size_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  padding_size_step_y                         padding_size_stride_y * number of elements along Y processed per workitem(in  bytes)
+ * @param[in]  padding_size_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ */
+__kernel void space_to_batch_4d_nchw(TENSOR4D_DECLARATION(input),
+                                     TENSOR4D_DECLARATION(output),
+                                     VECTOR_DECLARATION(block_size),
+                                     IMAGE_DECLARATION(padding_size))
+{
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+    int block_size_x = *((__global int *)(block_size_ptr));
+    int block_size_y = *((__global int *)(block_size_ptr + block_size_stride_x));
+    int shift_x = (get_global_id(2) / DEPTH_OUT / BATCH_IN) % block_size_x;
+    int shift_y = (get_global_id(2) / DEPTH_OUT / BATCH_IN) / block_size_x;
+
+    int in_index[4] = {0, };
+    in_index[0] = get_global_id(0) * block_size_x + shift_x - *((__global int *)(padding_size_ptr));
+    in_index[1] = get_global_id(1) * block_size_y + shift_y - *((__global int *)(padding_size_ptr + padding_size_stride_y));
+    in_index[2] = get_global_id(2) % DEPTH_OUT;
+    in_index[3] = (get_global_id(2) / DEPTH_OUT) % BATCH_IN;
+
+    if (in_index[0] < 0 || in_index[0] >= WIDTH_IN || in_index[1] < 0 || in_index[1] >= HEIGHT_IN)
+    {
+        *((__global DATA_TYPE *)out.ptr) = (DATA_TYPE)ZERO_VALUE;
+    }
+    else
+    {
+        *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3]));
+    }
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE)
+
+#if defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE)
+/** Perform space to batch with input of 4D and NHWC format
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor depth should be given as a preprocessor argument using -DHEIGHT_OUT=size. e.g. -DHEIGHT_OUT=16
+ * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size. e.g. -DBATCH_IN=16
+ * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DHEIGHT_IN=16
+ * @attention Input tensor width should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DWIDTH_IN=16
+ * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in]  input_ptr                                   Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                              Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                                input_stride_x * number of elements along X processed per workitem(in  bytes)
+ * @param[in]  input_stride_y                              Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                                input_stride_y * number of elements along Y processed per workitem(in  bytes)
+ * @param[in]  input_stride_z                              Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                                input_stride_z * number of elements along Z processed per workitem(in  bytes)
+ * @param[in]  input_stride_w                              Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                                input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes         The offset of the first element in the source tensor
+ * @param[out] output_ptr                                  Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                             Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                               output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                             Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                               output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                             Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                               output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                             Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                               output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes        The offset of the first element in the destination tensor
+ * @param[in]  block_size_ptr                              Pointer to the source tensor. Supported data types: S32
+ * @param[in]  block_size_stride_x                         Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  block_size_step_x                           block_size_stride_x * number of elements along X processed per workitem(in  bytes)
+ * @param[in]  block_size_offset_first_element_in_bytes    The offset of the first element in the destination tensor
+ * @param[in]  padding_size_ptr                            Pointer to the source tensor. Supported data types: S32
+ * @param[in]  padding_size_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  padding_size_step_x                         padding_size_stride_x * number of elements along X processed per workitem(in  bytes)
+ * @param[in]  padding_size_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  padding_size_step_y                         padding_size_stride_y * number of elements along Y processed per workitem(in  bytes)
+ * @param[in]  padding_size_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ */
+__kernel void space_to_batch_4d_nhwc(TENSOR4D_DECLARATION(input),
+                                     TENSOR4D_DECLARATION(output),
+                                     VECTOR_DECLARATION(block_size),
+                                     IMAGE_DECLARATION(padding_size))
+{
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, HEIGHT_OUT);
+
+    int block_size_x = *((__global int *)(block_size_ptr));
+    int block_size_y = *((__global int *)(block_size_ptr + block_size_stride_x));
+    int shift_x = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) % block_size_x;
+    int shift_y = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) / block_size_x;
+
+    int in_index[4] = {0, };
+    in_index[0] = get_global_id(0) * VEC_SIZE;
+    in_index[1] = get_global_id(1) * block_size_x + shift_x - *((__global int *)(padding_size_ptr));
+    in_index[2] = get_global_id(2) % HEIGHT_OUT * block_size_y + shift_y - *((__global int *)(padding_size_ptr + padding_size_stride_y));
+    in_index[3] = (get_global_id(2) / HEIGHT_OUT) % BATCH_IN;
+
+    if (in_index[1] < 0 || in_index[1] >= WIDTH_IN || in_index[2] < 0 || in_index[2] >= HEIGHT_IN)
+    {
+        VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))ZERO_VALUE, 0, (__global DATA_TYPE *)out.ptr);
+    }
+    else
+    {
+        VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3])),
+                                 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
+                         0, (__global DATA_TYPE *)out.ptr);
+    }
+}
+
+#endif // defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl
new file mode 100644
index 000000000..f6977045a
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE)
+/** Perform space to depth rearrangement of tensor
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
+ * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. -DBLOCK_SIZE=1
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in  bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in  bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in  bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p inpu
+t_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in
+bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void space_to_depth(
+     TENSOR4D_DECLARATION(input),
+     TENSOR4D_DECLARATION(output))
+ {
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+
+    int out_index[4]={0};
+    int in_index[4]={0};
+
+    in_index[0] = get_global_id(0);//W
+    in_index[1] = get_global_id(1);//H
+    in_index[2] = get_global_id(2) % DEPTH_IN;//C
+    in_index[3] = get_global_id(2) / DEPTH_IN;//B
+
+    out_index[0] = in_index[0]/BLOCK_SIZE;
+    out_index[1] = in_index[1]/BLOCK_SIZE;
+    out_index[2] = in_index[2] + ((in_index[1] % BLOCK_SIZE) * BLOCK_SIZE + in_index[0] % BLOCK_SIZE) * DEPTH_IN;
+    out_index[3] = in_index[3];
+
+    *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0],out_index[1],out_index[2],out_index[3])) =                                                                                                 *((__global DATA_TYPE *)in.ptr);
+ }
+#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl
new file mode 100644
index 000000000..3e1a5c97f
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(DATA_TYPE)
+/** Returns true value of squared_difference of two tensors.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Can only take floating point data types.
+ *
+ * @param[in]  input1_ptr                            Pointer to the source image. Supported data types: F16/F32
+ * @param[in]  input1_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input1_step_x                         input1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input1_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input1_step_y                         input1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input1_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input1_step_z                         input1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input1_offset_first_element_in_bytes  The offset of the first element in the source image
+ *
+ * @param[in]  input2_ptr                            Pointer to the source image. Supported data types: F16/F32
+ * @param[in]  input2_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input2_step_x                         input2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input2_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input2_step_y                         input2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input2_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input2_step_z                         input2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input2_offset_first_element_in_bytes  The offset of the first element in the source image
+ *
+ * @param[out] output_ptr                            Pointer to the destination image. Supported data types: F16/F32
+ * @param[in]  output_stride_x                       Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                       Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void squared_difference(
+    TENSOR3D_DECLARATION(input1),
+    TENSOR3D_DECLARATION(input2),
+    TENSOR3D_DECLARATION(output))
+{
+    Tensor3D input1  = CONVERT_TO_TENSOR3D_STRUCT(input1);
+    Tensor3D input2  = CONVERT_TO_TENSOR3D_STRUCT(input2);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    diff = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr)- VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr);
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    sq_diff = diff * diff;
+
+    VSTORE(VEC_SIZE)
+    (sq_diff, 0, (__global DATA_TYPE *)output.ptr);
+}
+#endif // defined(DATA_TYPE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice.cl
deleted file mode 100644
index c5ff82f9e..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice.cl
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-
-inline Tensor4D tensor4D_from_vector_no_step(const Vector *vector, int dim_x, int dim_y, int dim_z, int dim_w)
-{
-    int stride_x = vector->stride_x;
-    int stride_y = stride_x * dim_x;
-    int stride_z = stride_y * dim_y;
-    int stride_w = stride_z * dim_z;
-    Tensor4D tensor =
-    {
-        .ptr                           = vector->ptr,
-        .offset_first_element_in_bytes = vector->offset_first_element_in_bytes,
-        .stride_x                      = stride_x,
-        .stride_y                      = stride_y,
-        .stride_z                      = stride_z,
-        .stride_w                      = stride_w,
-    };
-    return tensor;
-}
-
-/** Extracts a strided slice up to 4-dimensions
- *
- * @note Datatype should be given as a preprocessor argument using -DELEMENT_DATA_TYPE=type. e.g. -DELEMENT_DATA_TYPE=short
- * @note The size of an element should be given as a preprocessor argument using -DELEMENT_SIZE=size. e.g. -DELEMENT_SIZE=2
- *
- * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32
- * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in]  dims_in                              The 4-dimensional dimension of the input. Supported data types: S32
- * @param[in]  dims_out                             The 4-dimensional dimension of the output. Supported data types: S32
- * @param[in]  starts                               The stride of X dimension of input tensor to be sliced. Supported data types: S32
- * @param[in]  strides                              The stride of Y dimension of input tensor to be sliced. Supported data types: S32
- */
-__kernel void strided_slice(VECTOR_DECLARATION(input),
-                            VECTOR_DECLARATION(output),
-                            const int4 dims_in,
-                            const int4 dims_out,
-                            const int4 starts,
-                            const int4 strides)
-{
-    // TODO: Should be change to CONVERT_TO_TENSOR4D_STRUCT in order to reduce inference of the offset
-    Vector vec_out = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output);
-    Vector vec_in = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input);
-
-    // Implemenation
-    // Infer a Tensor4D from output Vector and output's dimensions info
-    // Infer a Tensor4D from input Vector and input's dimensions info
-    // Infer indices of output as 4D from the offset of output vector
-    // Infer indices of input as 4D from indices of output
-    // out(offset of output vector) = in(offset of input)
-
-    Tensor4D tensor_out = tensor4D_from_vector_no_step(&vec_out, dims_out.x, dims_out.y, dims_out.z, dims_out.w);
-    Tensor4D tensor_in = tensor4D_from_vector_no_step(&vec_in, dims_in.x, dims_in.y, dims_in.z, dims_in.w);
-
-    // Must be output_step_x == output_stride_x == an element's size
-    const int offset_out = get_global_id(0) * output_stride_x;
-    int4 indices_out =
-    {
-            get_global_id(0) % dims_out.x,
-            (offset_out / tensor_out.stride_y) % dims_out.y,
-            (offset_out / tensor_out.stride_z) % dims_out.z,
-            (offset_out / tensor_out.stride_w) % dims_out.w,
-    };
-
-    int4 indices_in =
-    {
-            starts.x + (strides.x * indices_out.x),
-            starts.y + (strides.y * indices_out.y),
-            starts.z + (strides.z * indices_out.z),
-            starts.w + (strides.w * indices_out.w),
-    };
-
-    *((__global ELEMENT_DATA_TYPE *)vector_offset(&vec_out, get_global_id(0))) = *((__global ELEMENT_DATA_TYPE *)tensor4D_offset(&tensor_in, indices_in.x, indices_in.y, indices_in.z, indices_in.w));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice_ex.cl
new file mode 100644
index 000000000..b39c55b96
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice_ex.cl
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(ELEMENT_DATA_TYPE) && defined(DEPTH_OUT)
+/** Extracts a strided slice up to 4-dimensions
+ *
+ * @note Datatype should be given as a preprocessor argument using -DELEMENT_DATA_TYPE=type. e.g. -DELEMENT_DATA_TYPE=short
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  starts                               The stride of X dimension of input tensor to be sliced. Supported data types: S32
+ * @param[in]  strides                              The stride of Y dimension of input tensor to be sliced. Supported data types: S32
+ */
+__kernel void strided_slice_ex(TENSOR4D_DECLARATION(input),
+                               TENSOR4D_DECLARATION(output),
+                               const int4 starts,
+                               const int4 strides)
+{
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+    int4 indices_in =
+    {
+            starts.x + (strides.x * get_global_id(0)),
+            starts.y + (strides.y * get_global_id(1)),
+            starts.z + (strides.z * (get_global_id(2) % DEPTH_OUT)),
+            starts.w + (strides.w * (get_global_id(2) / DEPTH_OUT)),
+    };
+    *((__global ELEMENT_DATA_TYPE *)out.ptr) = *((__global ELEMENT_DATA_TYPE *)tensor4D_offset(&in, indices_in.x, indices_in.y, indices_in.z, indices_in.w));
+}
+#endif // defined(ELEMENT_DATA_TYPE) && defined(DEPTH_OUT)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl
index 0b0cf8218..d97f23a47 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl
@@ -2,25 +2,17 @@
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
  * Copyright (c) 2017 ARM Limited.
  *
- * SPDX-License-Identifier: MIT
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
  *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
+ *      http://www.apache.org/licenses/LICENSE-2.0
  *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 
 #include "helpers.h"
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl
index deadf8412..0292fab04 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl
@@ -2,25 +2,17 @@
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
  * Copyright (c) 2017 ARM Limited.
  *
- * SPDX-License-Identifier: MIT
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
  *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
+ *      http://www.apache.org/licenses/LICENSE-2.0
  *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 
 #include "helpers.h"
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
index cac0c071e..c2c2d89a4 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
@@ -2,25 +2,17 @@
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
  * Copyright (c) 2017 ARM Limited.
  *
- * SPDX-License-Identifier: MIT
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
  *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
+ *      http://www.apache.org/licenses/LICENSE-2.0
  *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 
 // reference: