summaryrefslogtreecommitdiff
path: root/libs/ARMComputeEx/src/core/CL/cl_kernels
diff options
context:
space:
mode:
Diffstat (limited to 'libs/ARMComputeEx/src/core/CL/cl_kernels')
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl89
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl94
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_ex.cl74
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl126
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/batch_to_space_nd.cl70
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl84
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl146
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op.cl86
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl93
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl69
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl84
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl57
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl98
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl88
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h352
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h406
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl48
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl86
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl72
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl88
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl80
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl111
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl74
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl88
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl152
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl163
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl69
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl75
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice_ex.cl63
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl103
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl130
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl271
32 files changed, 0 insertions, 3689 deletions
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl
deleted file mode 100644
index f54c7bde3..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-
-#define CONST_ONE 1.f
-#define DIV_OP(a, b) ((a) / (b))
-#define RSQRT_OP(a) DIV_OP(CONST_ONE, sqrt((a)))
-
-// Inverse Square-root Activation
-inline TYPE rsqrt_op(TYPE x)
-{
- return RSQRT_OP(x);
-}
-
-#define ACTIVATION_OP2(op, x) op##_op(x)
-#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
-
-#if defined(ACT)
-
-/** This performs an activation function floating point inputs.
- *
- * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
- *
- * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void activation_layer_ex(
- TENSOR3D_DECLARATION(input)
-#ifndef IN_PLACE
- ,
- TENSOR3D_DECLARATION(output)
-#endif /* not IN_PLACE */
-)
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
-#ifdef IN_PLACE
- Tensor3D output = input;
-#else /* IN_PLACE */
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-#endif /* IN_PLACE */
-
- // Load data
- TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
-
- // Perform activation
- data = ACTIVATION_OP(ACT, data);
-
- // Store result
- VSTORE(VEC_SIZE)
- (data, 0, (__global DATA_TYPE *)output.ptr);
-}
-
-#endif /* defined(ACT) */
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl
deleted file mode 100644
index 9a6921d7c..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
-/** Perform arg_max/arg_min
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
- * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using
- * -DOP_CODE = number. e.g. -DOP_CODE=1
- *
- * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[out] output_ptr Pointer to the destination image. Supported data types: U32
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] axis Axis through which reduction occurs for max value index
- * @param[in] dim Dimension across the axis to be reduced.
- */
-
-__kernel void arg_op(TENSOR4D_DECLARATION(input),
- TENSOR4D_DECLARATION(output),
- const int axis,
- const int dim)
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
-
- int indices[4] =
- {
- get_global_id(0),
- get_global_id(1),
- get_global_id(2) % DEPTH_OUT,
- get_global_id(2) / DEPTH_OUT,
- };
-
- DATA_TYPE value = *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
- DATA_TYPE tval = value;
- int idx = 0;
- for(int i = 1; i < dim; ++i)
- {
- indices[axis] = i;
-
- #if OP_CODE == 1 // ArgMax
- value = max(value, *((__global DATA_TYPE *)
- tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])));
- #elif OP_CODE == 2 //ArgMin
- value = min(value, *((__global DATA_TYPE *)
- tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])));
- #else
- return;
-
- #endif
-
- if(tval!=value)
- {
- idx = indices[axis];
- tval = value;
- }
- }
-
- *((__global uint *)out.ptr) = idx;
-}
-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_ex.cl
deleted file mode 100644
index 2ed698951..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_ex.cl
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#ifdef SATURATE
-#define SUB(x, y) sub_sat((x), (y))
-#else /* SATURATE */
-#define SUB(x, y) (x) - (y)
-#endif /* SATURATE */
-
-/** This function subtracts one tensors from another.
- *
- * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short
- * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
- *
- * @param[in] in1_ptr Pointer to the source tensor. Supported data types: U8, S16
- * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] in2_ptr Pointer to the source tensor. Supported data types: U8, S16
- * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8, S16
- * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void arithmetic_sub_ex(
- TENSOR3D_DECLARATION(in1),
- TENSOR3D_DECLARATION(in2),
- TENSOR3D_DECLARATION(out))
-{
- // Get pixels pointer
- Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
- Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
-
- // Load values
- VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
- in_a = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
- VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
- in_b = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
-
- // Calculate and store result
- vstore16(SUB(in_a, in_b), 0, (__global DATA_TYPE_OUT *)out.ptr);
-}
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
deleted file mode 100644
index 5cd0a4309..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers_asymm.h"
-
-#ifdef SATURATE
-#define ADD(x, y) add_sat((x), (y))
-#define SUB(x, y) sub_sat((x), (y))
-#else /* SATURATE */
-#define ADD(x, y) (x) + (y)
-#define SUB(x, y) (x) - (y)
-#endif /* SATURATE */
-
-/** Performs a pixelwise addition used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
- *
- * The following computations will be performed:
- *
- * -# Add offset terms to inputs
- -# Get scaled value of two inputs
- * -# Add inputs
- * -# Add offset terms to final result
- * -# Multiply each entry of result by result_mult_int
- * -# Shift the int32 accumulator by result_shift
- * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
- *
- * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
- * @attention The number of bits to shift left of input tensors must be passed at compile time using -DLEFT_SHIFT
- * @attention The offset, scalar scale factor and number of bits to shift right of input tensors must be passed at compile time using -DIN1_OFFSET, -RIN1_MULT_INT, -DIN1_SHIFT, -DIN2_OFFSET, -RIN2_MULT_INT and -DIN2_SHIFT
- * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT
- *
- * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
- * @attention The inputs and output scale information of qasymm8 need to be passed at compile time using -DSCALE_IN1, -DSCALE_IN2 and -DSCALE_OUT:
- * e.g. -DSCALE_IN1=1.f -DSCALE_IN2=1.f -DSCALE_OUT=2.f
- * @attention The inputs and output scale offset need to be passed at compile time using -DOFFSET_IN1, -DOFFSET_IN2 and -DOFFSET_OUT:
- * e.g. -DOFFSET_IN1=0 -DOFFSET_IN2=0 -DOFFSET_OUT=0
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
- *
- * @param[in] in1_ptr Pointer to the source tensor. Supported data types: QASYMM8
- * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] in2_ptr Pointer to the source tensor. Supported data types: QASYMM8
- * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] out_ptr Pointer to the destination tensor. Supported data types: QASYMM8
- * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void arithmetic_add_qasymm8(
- TENSOR3D_DECLARATION(in1),
- TENSOR3D_DECLARATION(in2),
- TENSOR3D_DECLARATION(out))
-{
- // Get pixels pointer
- Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
- Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
-
- // Load data
- VEC_DATA_TYPE(int, 16)
- in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16));
- VEC_DATA_TYPE(int, 16)
- in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16));
-
- // Get scaled value of two inputs
- VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET);
- VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET);
-
- VEC_DATA_TYPE(int, 16) left_shift = (VEC_DATA_TYPE(int, 16))1 << (VEC_DATA_TYPE(int, 16))(LEFT_SHIFT);
- VEC_DATA_TYPE(int, 16) shifted_in1_val = in1_val * left_shift;
- VEC_DATA_TYPE(int, 16) shifted_in2_val = in2_val * left_shift;
-
- VEC_DATA_TYPE(int, 16) scaled_in1_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in1_val, IN1_MULT_INT, IN1_SHIFT, 16);
- VEC_DATA_TYPE(int, 16) scaled_in2_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in2_val, IN2_MULT_INT, IN2_SHIFT, 16);
-
- // Add inputs and multiply with a multiplier smaller than 1
- VEC_DATA_TYPE(int, 16) sum_val = scaled_in1_val + scaled_in2_val;
- VEC_DATA_TYPE(int, 16) out_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(sum_val, RESULT_MULT_INT, RESULT_SHIFT, 16);
- out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET);
-
- VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16));
-
-// TODO: Apply min-max BOUND to support fuse with relu.
-/*
-#if defined(MIN_BOUND)
- res = max(res, (uchar16)MIN_BOUND);
-#endif // defined(MIN_BOUND)
-#if defined(MAX_BOUND)
- res = min(res, (uchar16)MAX_BOUND);
-#endif // defined(MAX_BOUND)
-*/
-
- // Store result
- VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)),
- 0, (__global DATA_TYPE_OUT *)out.ptr);
-}
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/batch_to_space_nd.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/batch_to_space_nd.cl
deleted file mode 100644
index ad6a48a02..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/batch_to_space_nd.cl
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE0) && defined(BLOCK_SIZE1) && defined(BATCH_OUT)
-/** Perform batch to space rearrangement of tensor
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Output tensor batch should be given as a preprocessor argument using -DBATCH_OUT=size. e.g. -DBATCH_OUT=16
- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE0=size. e.g. -DBLOCK_SIZE0=1
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- *
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p inpu
-t_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in
-bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void batch_to_space_nd(
- TENSOR4D_DECLARATION(input),
- TENSOR4D_DECLARATION(output))
- {
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
-
- int out_index[4]={0};
- int in_index[4]={0};
-
- out_index[0] = get_global_id(0);//W
- out_index[1] = get_global_id(1);//H
- out_index[2] = get_global_id(2) % DEPTH_OUT;//C
- out_index[3] = get_global_id(2) / DEPTH_OUT;//N
-
- in_index[0] = out_index[0]/BLOCK_SIZE1;
- in_index[1] = out_index[1]/BLOCK_SIZE0;
- in_index[2] = out_index[2];
- in_index[3] = out_index[3] + ((out_index[1] % BLOCK_SIZE0) * BLOCK_SIZE0 + out_index[0] % BLOCK_SIZE1) * BATCH_OUT;
-
- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3]));
- }
-#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE0) && defined(BLOCK_SIZE1) && defined(BATCH_OUT)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl
deleted file mode 100644
index bea61f53e..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#ifndef VEC_SIZE
-#define VEC_SIZE 1
-#endif
-
-#if defined(OP_CODE) && defined(DATA_TYPE)
-/** returns truth value of the two input tensors for BINARY LOGICAL OP.
- * where BINARY LOGICAL OP can be AND, OR.
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=uchar
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using
- * -DOP_CODE = number. e.g. -DOP_CODE=1
- *
- * @param[in] input1_ptr Pointer to the source tensor. Supported data types: QASYMM8
- * @param[in] input1_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input1_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source tensor
- *
- * @param[in] input2_ptr Pointer to the source tensor.Supported data types: QASYMM8
- * @param[in] input2_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input2_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input2_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input2_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input2_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source tensor
- *
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: QASYMM8
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- */
-__kernel void binary_logical_op(
- TENSOR3D_DECLARATION(input1),
- TENSOR3D_DECLARATION(input2),
- TENSOR3D_DECLARATION(output))
-{
- Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1);
- Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- #if OP_CODE == 1 // LOGICAL AND
- VSTORE(VEC_SIZE)
- (CONVERT(VLOAD(VEC_SIZE)
- (0, (__global DATA_TYPE *)input1.ptr) && VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr),
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr);
-
- #elif OP_CODE == 2 // LOGICAL OR
- VSTORE(VEC_SIZE)
- (CONVERT(VLOAD(VEC_SIZE)
- (0, (__global DATA_TYPE *)input1.ptr) || VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr),
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr);
-
- #else // OP NOT SUPPORTED
- return
-
- #endif
-}
-#endif //if defined(OP_CODE) && defined(DATA_TYPE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
deleted file mode 100644
index 3d4675e5d..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#ifndef SCALE
-#define SCALE 1.0f
-#endif
-#ifndef OFFSET
-#define OFFSET 0
-#endif
-#ifndef VEC_SIZE
-#define VEC_SIZE 1
-#endif
-
-#if defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
-/** Perform a cast operation on an input tensor.
- *
- * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- *
- * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void cast(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr),
- VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),
- 0, (__global DATA_TYPE_OUT *)output.ptr);
-}
-
-/** Perform a cast operation on an QASYMM8 input tensor.
- * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
- * @attention Offset and Scale of input should be given as a preprocessor argument using -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- *
- * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void cast_qasymm_in(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_data =
- VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
- VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET);
- VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE);
-
- VEC_DATA_TYPE(int, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(int, VEC_SIZE)) - offset;
- VEC_DATA_TYPE(float, VEC_SIZE) out_data = CONVERT(tmp, VEC_DATA_TYPE(float, VEC_SIZE)) * scale;
-
- VSTORE(VEC_SIZE)(CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),
- 0, (__global DATA_TYPE_OUT *)output.ptr);
-}
-
-
-/** Perform a cast operation on an QASYMM8 output tensor.
- * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
- * @attention Offset and Scale of output should be given as a preprocessor argument using -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- *
- * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported data types: U8
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void cast_qasymm_out(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_data =
- VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
- VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET);
- VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE);
-
- VEC_DATA_TYPE(float, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(float, VEC_SIZE)) / scale;
- VEC_DATA_TYPE(float, VEC_SIZE) out_data = tmp + CONVERT(offset, VEC_DATA_TYPE(float, VEC_SIZE));
-
- VSTORE(VEC_SIZE)(CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),
- 0, (__global DATA_TYPE_OUT *)output.ptr);
-}
-#endif // defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op.cl
deleted file mode 100644
index 765072556..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op.cl
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#ifndef VEC_SIZE
-#define VEC_SIZE 1
-#endif
-
-#if defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) && defined(OP_CODE)
-/** Returns truth value of comparison operators.
- * Comparison operators may be equal, not_equal etc.
- *
- * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN, -DDATA_TYPE_OUT,
- * e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT = uchar
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using
- * -DOP_CODE = number. e.g. -DOP_CODE=1
- *
- * @param[in] input1_ptr Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
- * @param[in] input1_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input1_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source tensor
- *
- * @param[in] input2_ptr Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
- * @param[in] input2_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input2_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input2_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input2_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input2_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source tensor
- *
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: QASYMM8
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void comparison_op(
- TENSOR3D_DECLARATION(input1),
- TENSOR3D_DECLARATION(input2),
- TENSOR3D_DECLARATION(output))
-{
- Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1);
- Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- #if OP_CODE == 1 //EQUAL
- VSTORE(VEC_SIZE)
- (CONVERT(VLOAD(VEC_SIZE)
- (0, (__global DATA_TYPE_IN *)input1.ptr) == VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input2.ptr),
- VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),0, (__global DATA_TYPE_OUT *)output.ptr);
-
- #elif OP_CODE == 2 //NOT_EQUAL
- VSTORE(VEC_SIZE)
- (CONVERT(VLOAD(VEC_SIZE)
- (0, (__global DATA_TYPE_IN *)input1.ptr) != VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input2.ptr),
- VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)output.ptr);
-
- #else // OP NOT SUPPORTED
- return;
-
- #endif
-}
-#endif // defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) && defined(OP_CODE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl
deleted file mode 100644
index 1eb305f7b..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-#define SUB(x, y) (x) - (y)
-
-#ifndef VEC_SIZE
-#define VEC_SIZE 1
-#endif
-
-#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(DATA_TYPE_OUT)
-
-#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
-#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
-#define VEC_OUT VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
-
-/** Returns the truth value of comparison .
- * @attention Offset and Scale of both input should be given as a preprocessor argument using -DOFFSET_IN1=int, -DOFFSET_IN2=int, -DSCALE_IN1=float and -DSCALE_IN2=float. e.g. -DOFFSET_IN1=1, -DOFFSET_IN2=0, -DSCALE_IN1=0.5, -DSCALE_IN2=0.5
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using
- * -DOP_CODE = number. e.g. -DOP_CODE=1
- *
- * @param[in] input1_ptr Pointer to the source tensor. Supported data types: QASYMM8
- * @param[in] input1_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input1_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source tensor
- *
- * @param[in] input2_ptr Pointer to the source tensor. Supported data types: QASYMM8
- * @param[in] input2_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input2_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input2_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input2_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input2_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source tensor
- *
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: QASYMM8
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void comparison_op_qasymm8(
- TENSOR3D_DECLARATION(in1),
- TENSOR3D_DECLARATION(in2),
- TENSOR3D_DECLARATION(out))
-{
- // Get pixels pointer
- Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
- Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
-
- VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)in1.ptr), VEC_INT);
- VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)in2.ptr), VEC_INT);
-
- in_a = SUB(in_a, (VEC_INT)((int)OFFSET_IN1));
- in_b = SUB(in_b, (VEC_INT)((int)OFFSET_IN2));
-
- const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1);
- const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2);
-
- #if OPCODE == 1 //EQUAL QUANTIZED
- VSTORE(VEC_SIZE)(CONVERT(in1f32 == in2f32, VEC_OUT), 0, (__global DATA_TYPE_OUT *)out.ptr);
-
- #elif OPCODE == 2 //NOT EQUAL QUANTIZED
- VSTORE(VEC_SIZE)(CONVERT(in1f32 != in2f32, VEC_OUT), 0, (__global DATA_TYPE_OUT *)out.ptr);
-
- #else // OP NOT SUPPORTED
- return;
-
- #endif
-}
-#endif // defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(DATA_TYPE_OUT)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
deleted file mode 100644
index fef2243e7..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE)
-/** Perform space to depth rearrangement of tensor
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. -DBLOCK_SIZE=1
- *
- * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p inpu
-t_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in
-bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void depth_to_space(
- TENSOR4D_DECLARATION(input),
- TENSOR4D_DECLARATION(output))
- {
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
-
- int out_index[4]={0};
- int in_index[4]={0};
-
- out_index[0] = get_global_id(0);//W
- out_index[1] = get_global_id(1);//H
- out_index[2] = get_global_id(2) % DEPTH_OUT;//C
- out_index[3] = get_global_id(2) / DEPTH_OUT;//B
-
- in_index[0] = out_index[0]/BLOCK_SIZE;
- in_index[1] = out_index[1]/BLOCK_SIZE;
- in_index[2] = out_index[2] + ((out_index[1] % BLOCK_SIZE) * BLOCK_SIZE + out_index[0] % BLOCK_SIZE) * DEPTH_OUT;
- in_index[3] = out_index[3];
-
- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2],in_index[3]));
- }
-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl
deleted file mode 100644
index 348458fe9..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#ifndef VEC_SIZE
-#define VEC_SIZE 1
-#endif
-
-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
-/** Perform embedding_lookup of input tensor
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16
- * @attention Number of input dimensions are passed as a preprocessor argument using -DNUM_DIMS=size, e.g. -DNUM_DIMS=4
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] lookups_ptr Pointer to the lookups vector. Supported data types: S32
- * @param[in] lookups_stride_x Stride of the lookups vector in X dimension (in bytes)
- * @param[in] lookups_step_x lookups_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector
- */
-
-__kernel void embedding_lookup(TENSOR4D_DECLARATION(input),
- TENSOR4D_DECLARATION(output),
- VECTOR_DECLARATION(lookups))
-{
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT);
-
- Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups);
-
- //lookup ids for based on the tensor dimensions
- int lup_id[4] = {0};
-
- lup_id[0] = (NUM_DIMS == 1)?*((__global int *)vector_offset(&lups,get_global_id(0)))
- :get_global_id(0);
- lup_id[1] = (NUM_DIMS == 2)?*((__global int *)vector_offset(&lups,get_global_id(1)))
- :get_global_id(1);
- lup_id[2] = (NUM_DIMS == 3)?*((__global int *)vector_offset(&lups,get_global_id(2)))
- :get_global_id(2)%DEPTH_OUT;
- lup_id[3] = (NUM_DIMS == 4)?*((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT))
- :get_global_id(2) / DEPTH_OUT;
-
- in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + lup_id[1] * input_step_y
- + lup_id[2] * input_step_z + lup_id[3] * input_step_w;
-
- VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
- 0, (__global DATA_TYPE *)out.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl
deleted file mode 100644
index 69d94f30a..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#ifndef VEC_SIZE
-#define VEC_SIZE 1
-#endif
-
-#if defined(DATA_TYPE)
-/** Perform an exponential operation on an input tensor.
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Can only take floating point data types.
- *
- * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void exp_layer(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- VSTORE(VEC_SIZE)
- (exp(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr)), 0, (__global DATA_TYPE *)output.ptr);
-}
-#endif // defined(DATA_TYPE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl
deleted file mode 100644
index 6b767d6c9..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-/** Perform gather
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- *
- * @param[in] input1_ptr Pointer to the first source tensor. Supported data types: U8/S32/F32
- * @param[in] input1_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] input1_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input1_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] input1_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input1_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] input1_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[in] input2_ptr Pointer to the first source tensor. Supported data types: U32
- * @param[in] input2_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] input2_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void gather(IMAGE_DECLARATION(input1),
- VECTOR_DECLARATION(input2),
- IMAGE_DECLARATION(output))
-{
- Image in1 = CONVERT_TO_IMAGE_STRUCT_NO_STEP(input1);
- Vector in2 = CONVERT_TO_VECTOR_STRUCT(input2);
- Image out = CONVERT_TO_IMAGE_STRUCT_NO_STEP(output);
-
- VEC_DATA_TYPE(DATA_TYPE_IN2, 2)
- in2_data = CONVERT(vload2(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_IN2, 2));
-
- //TODO: performance tuning for memcopy
- int index = in2_data.s0;
- int stride=input1_stride_y/input1_stride_x;
-
- for(int i=0; i<stride; i++){
- *((__global DATA_TYPE_OUT *)offset(&out, i,get_global_id(0)))=*((__global DATA_TYPE_IN1 *)offset(&in1, i,index));
- }
-}
-
-__kernel void gather_1d_out(IMAGE_DECLARATION(input1),
- VECTOR_DECLARATION(input2),
- VECTOR_DECLARATION(output))
-{
- Image in1 = CONVERT_TO_IMAGE_STRUCT_NO_STEP(input1);
- Vector in2 = CONVERT_TO_VECTOR_STRUCT(input2);
- Vector out = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output);
-
- VEC_DATA_TYPE(DATA_TYPE_IN2, 2)
- in2_data = CONVERT(vload2(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_IN2, 2));
-
- //TODO: performance tuning for memcopy
- int index = in2_data.s0;
- int stride=input1_stride_y/input1_stride_x;
-
- for(int i=0; i<stride; i++){
- *((__global DATA_TYPE_OUT *)vector_offset(&out, i+get_global_id(0)))=*((__global DATA_TYPE_IN1 *)offset(&in1, i, index));
- }
-}
-
-__kernel void gather_1d(VECTOR_DECLARATION(input1),
- VECTOR_DECLARATION(input2),
- VECTOR_DECLARATION(output))
-{
- Vector in1 = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input1);
- Vector in2 = CONVERT_TO_VECTOR_STRUCT(input2);
- Vector out = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output);
-
- VEC_DATA_TYPE(DATA_TYPE_IN2, 2)
- in2_data = CONVERT(vload2(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_IN2, 2));
-
- //TODO: performance tuning for memcopy
- int index = in2_data.s0;
- *((__global DATA_TYPE_OUT *)vector_offset(&out, get_global_id(0)))=*((__global DATA_TYPE_IN1 *)vector_offset(&in1, index));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl
deleted file mode 100644
index ed7409852..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#ifndef VEC_SIZE
-#define VEC_SIZE 1
-#endif
-
-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
-/** Perform hashtable_lookup of input tensor
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16
- * @attention Number of input dimensions are passed as a preprocessor argument using -DNUM_DIMS=size, e.g. -DNUM_DIMS=4
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] lookups_ptr Pointer to the lookups vector. Supported data types: S32
- * @param[in] lookups_stride_x Stride of the lookups vector in X dimension (in bytes)
- * @param[in] lookups_step_x lookups_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector
- */
-__kernel void hashtable_lookup(TENSOR4D_DECLARATION(input),
- TENSOR4D_DECLARATION(output),
- VECTOR_DECLARATION(lookups))
-{
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT);
-
- Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups);
-
- int lup_id[4] = {0};
-
- lup_id[0] = (NUM_DIMS == 1)?*((__global int *)vector_offset(&lups,get_global_id(0)))
- :get_global_id(0);
- lup_id[1] = (NUM_DIMS == 2)?*((__global int *)vector_offset(&lups,get_global_id(1)))
- :get_global_id(1);
- lup_id[2] = (NUM_DIMS == 3)?*((__global int *)vector_offset(&lups,get_global_id(2)))
- :get_global_id(2)%DEPTH_OUT;
- lup_id[3] = (NUM_DIMS == 4)?*((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT))
- :get_global_id(2) / DEPTH_OUT;
-
- if (lup_id[NUM_DIMS-1] < 0)
- {
- VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, 0, (__global DATA_TYPE *)out.ptr);
- return;
- }
-
- in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + lup_id[1] * input_step_y
- + lup_id[2] * input_step_z + lup_id[3] * input_step_w;
-
- VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
- 0, (__global DATA_TYPE *)out.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
deleted file mode 100644
index 0e123ae0a..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
+++ /dev/null
@@ -1,352 +0,0 @@
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_HELPER_H
-#define ARM_COMPUTE_HELPER_H
-
-#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
-
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
-#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
-#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
-
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && \
- defined(cl_arm_integer_dot_product_accumulate_int8)
-#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
-#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) &&
- // defined(cl_arm_integer_dot_product_accumulate_int8)
-
-#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
-#pragma OPENCL EXTENSION cl_arm_printf : enable
-#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
-
-#define EXPAND(x) x
-
-#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
-
-#define VLOAD_STR(size) vload##size
-#define VLOAD(size) VLOAD_STR(size)
-
-#define VSTORE_STR(size) vstore##size
-#define VSTORE(size) VSTORE_STR(size)
-
-#define VEC_DATA_TYPE_STR(type, size) type##size
-#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
-
-#define CL_VEC_DATA_TYPE_STR(type, size) type##size
-#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
-
-#define CONVERT_STR(x, type) (convert_##type((x)))
-#define CONVERT(x, type) CONVERT_STR(x, type)
-
-#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
-#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
-
-#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
-#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
-
-#define VECTOR_DECLARATION(name) \
- __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, \
- uint name##_offset_first_element_in_bytes
-
-#define IMAGE_DECLARATION(name) \
- __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \
- uint name##_step_y, uint name##_offset_first_element_in_bytes
-
-#define TENSOR3D_DECLARATION(name) \
- __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \
- uint name##_step_y, uint name##_stride_z, uint name##_step_z, \
- uint name##_offset_first_element_in_bytes
-
-#define TENSOR4D_DECLARATION(name) \
- __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \
- uint name##_step_y, uint name##_stride_z, uint name##_step_z, uint name##_stride_w, \
- uint name##_step_w, uint name##_offset_first_element_in_bytes
-
-#define CONVERT_TO_VECTOR_STRUCT(name) \
- update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
- name##_step_x)
-
-#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
- update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
-
-#define CONVERT_TO_IMAGE_STRUCT(name) \
- update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
- name##_step_x, name##_stride_y, name##_step_y)
-
-#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
- update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \
- name##_stride_y, 0)
-
-#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
- update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
- name##_stride_x, name##_step_x, name##_stride_y, \
- name##_step_y, name##_stride_z, name##_step_z)
-
-#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
- update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
- name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, \
- name##_step_z)
-
-#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
- update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
- name##_stride_x, name##_step_x, name##_stride_y, \
- name##_step_y, name##_stride_z, name##_step_z)
-
-#define CONVERT_TO_TENSOR3D_STRUCT(name) \
- update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
- name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \
- name##_step_z)
-
-#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
- update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
- 0, name##_stride_y, 0, name##_stride_z, 0)
-
-#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \
- update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
- name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \
- name##_step_z, name##_stride_w, name##_step_w, mod_size)
-
-#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
- update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
- 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, \
- mod_size)
-
-/** Structure to hold Vector information */
-typedef struct Vector
-{
- __global uchar *ptr; /**< Pointer to the starting postion of the buffer */
- int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
- int stride_x; /**< Stride of the image in X dimension (in bytes) */
-} Vector;
-
-/** Structure to hold Image information */
-typedef struct Image
-{
- __global uchar *ptr; /**< Pointer to the starting postion of the buffer */
- int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
- int stride_x; /**< Stride of the image in X dimension (in bytes) */
- int stride_y; /**< Stride of the image in Y dimension (in bytes) */
-} Image;
-
-/** Structure to hold 3D tensor information */
-typedef struct Tensor3D
-{
- __global uchar *ptr; /**< Pointer to the starting postion of the buffer */
- int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
- int stride_x; /**< Stride of the image in X dimension (in bytes) */
- int stride_y; /**< Stride of the image in Y dimension (in bytes) */
- int stride_z; /**< Stride of the image in Z dimension (in bytes) */
-} Tensor3D;
-
-/** Structure to hold 4D tensor information */
-typedef struct Tensor4D
-{
- __global uchar *ptr; /**< Pointer to the starting postion of the buffer */
- int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
- int stride_x; /**< Stride of the image in X dimension (in bytes) */
- int stride_y; /**< Stride of the image in Y dimension (in bytes) */
- int stride_z; /**< Stride of the image in Z dimension (in bytes) */
- int stride_w; /**< Stride of the image in W dimension (in bytes) */
-} Tensor4D;
-
-/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's
- * data.
- *
- * @param[in] ptr Pointer to the starting postion of the buffer
- * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
- * @param[in] stride_x Stride of the vector in X dimension (in bytes)
- * @param[in] step_x stride_x * number of elements along X processed per
- * workitem(in bytes)
- *
- * @return An image object
- */
-inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
- uint stride_x, uint step_x)
-{
- Vector vector = {
- .ptr = ptr,
- .offset_first_element_in_bytes = offset_first_element_in_bytes,
- .stride_x = stride_x,
- };
- vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
- return vector;
-}
-
-/** Wrap image information into an Image structure, and make the pointer point at this workitem's
- * data.
- *
- * @param[in] ptr Pointer to the starting postion of the buffer
- * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] stride_x Stride of the image in X dimension (in bytes)
- * @param[in] step_x stride_x * number of elements along X processed per
- * workitem(in bytes)
- * @param[in] stride_y Stride of the image in Y dimension (in bytes)
- * @param[in] step_y stride_y * number of elements along Y processed per
- * workitem(in bytes)
- *
- * @return An image object
- */
-inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
- uint stride_x, uint step_x, uint stride_y, uint step_y)
-{
- Image img = {.ptr = ptr,
- .offset_first_element_in_bytes = offset_first_element_in_bytes,
- .stride_x = stride_x,
- .stride_y = stride_y};
- img.ptr +=
- img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
- return img;
-}
-
-/** Wrap 3D tensor information into an image structure, and make the pointer point at this
- * workitem's data.
- *
- * @param[in] ptr Pointer to the starting postion of the buffer
- * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] stride_x Stride of the image in X dimension (in bytes)
- * @param[in] step_x stride_x * number of elements along X processed per
- * workitem(in bytes)
- * @param[in] stride_y Stride of the image in Y dimension (in bytes)
- * @param[in] step_y stride_y * number of elements along Y processed per
- * workitem(in bytes)
- * @param[in] stride_z Stride of the image in Z dimension (in bytes)
- * @param[in] step_z stride_z * number of elements along Z processed per
- * workitem(in bytes)
- *
- * @return A 3D tensor object
- */
-inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr,
- uint offset_first_element_in_bytes,
- uint stride_x, uint step_x, uint stride_y,
- uint step_y, uint stride_z, uint step_z)
-{
- Image img = {.ptr = ptr,
- .offset_first_element_in_bytes = offset_first_element_in_bytes,
- .stride_x = stride_x,
- .stride_y = stride_y};
- img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x +
- get_global_id(1) * step_y + get_global_id(2) * step_z;
- return img;
-}
-
-/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this
- * workitem's data.
- *
- * @param[in] ptr Pointer to the starting postion of the buffer
- * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] stride_x Stride of the image in X dimension (in bytes)
- * @param[in] step_x stride_x * number of elements along X processed per
- * workitem(in bytes)
- * @param[in] stride_y Stride of the image in Y dimension (in bytes)
- * @param[in] step_y stride_y * number of elements along Y processed per
- * workitem(in bytes)
- * @param[in] stride_z Stride of the image in Z dimension (in bytes)
- * @param[in] step_z stride_z * number of elements along Z processed per
- * workitem(in bytes)
- *
- * @return A 3D tensor object
- */
-inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr,
- uint offset_first_element_in_bytes, uint stride_x,
- uint step_x, uint stride_y, uint step_y, uint stride_z,
- uint step_z)
-{
- Tensor3D tensor = {.ptr = ptr,
- .offset_first_element_in_bytes = offset_first_element_in_bytes,
- .stride_x = stride_x,
- .stride_y = stride_y,
- .stride_z = stride_z};
- tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x +
- get_global_id(1) * step_y + get_global_id(2) * step_z;
- return tensor;
-}
-
-inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr,
- uint offset_first_element_in_bytes, uint stride_x,
- uint step_x, uint stride_y, uint step_y, uint stride_z,
- uint step_z, uint stride_w, uint step_w, uint mod_size)
-{
- Tensor4D tensor = {.ptr = ptr,
- .offset_first_element_in_bytes = offset_first_element_in_bytes,
- .stride_x = stride_x,
- .stride_y = stride_y,
- .stride_z = stride_z,
- .stride_w = stride_w};
-
- tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x +
- get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z +
- (get_global_id(2) / mod_size) * step_w;
- return tensor;
-}
-
-/** Get the pointer position of a Vector
- *
- * @param[in] vec Pointer to the starting position of the buffer
- * @param[in] x Relative X position
- */
-inline __global const uchar *vector_offset(const Vector *vec, int x)
-{
- return vec->ptr + x * vec->stride_x;
-}
-
-/** Get the pointer position of a Image
- *
- * @param[in] img Pointer to the starting position of the buffer
- * @param[in] x Relative X position
- * @param[in] y Relative Y position
- */
-inline __global uchar *offset(const Image *img, int x, int y)
-{
- return img->ptr + x * img->stride_x + y * img->stride_y;
-}
-
-/** Get the pointer position of a Tensor3D
- *
- * @param[in] tensor Pointer to the starting position of the buffer
- * @param[in] x Relative X position
- * @param[in] y Relative Y position
- * @param[in] z Relative Z position
- */
-inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
-{
- return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
-}
-
-/** Get the pointer position of a Tensor4D
- *
- * @param[in] tensor Pointer to the starting position of the buffer
- * @param[in] x Relative X position
- * @param[in] y Relative Y position
- * @param[in] z Relative Z position
- * @param[in] w Relative W position
- */
-inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
-{
- return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z +
- w * tensor->stride_w;
-}
-
-#endif // _HELPER_H
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h b/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
deleted file mode 100644
index c39138caa..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
+++ /dev/null
@@ -1,406 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_HELPERS_ASYMM_H
-#define ARM_COMPUTE_HELPERS_ASYMM_H
-
-#include "helpers.h"
-
-/** Correctly-rounded-to-nearest division by a power-of-two.
- *
- * @param[in] size Size of vector.
- *
- * @return Correctly-rounded-to-nearest division by a power-of-two.
- */
-#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) \
- asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, int exponent) \
- { \
- VEC_DATA_TYPE(int, size) \
- mask = (1 << exponent) - 1; \
- const VEC_DATA_TYPE(int, size) zero = 0; \
- const VEC_DATA_TYPE(int, size) one = 1; \
- VEC_DATA_TYPE(int, size) \
- threshold = (mask >> 1) + select(zero, one, x < 0); \
- return (x >> exponent) + select(zero, one, (x & mask) > threshold); \
- }
-
-/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
- * rounding to the nearest value, and saturating -1 * -1 to the maximum value.
- *
- * @param[in] size Size of vector.
- *
- * @return Product of two fixed-point numbers.
- */
-#define ASYMM_MULT_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) \
- asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
- { \
- VEC_DATA_TYPE(int, size) \
- overflow = a == b && a == INT_MIN; \
- VEC_DATA_TYPE(long, size) \
- a_64 = convert_long##size(a); \
- VEC_DATA_TYPE(long, size) \
- b_64 = convert_long##size(b); \
- VEC_DATA_TYPE(long, size) \
- ab_64 = a_64 * b_64; \
- /* COMPMID-907 */ \
- VEC_DATA_TYPE(int, size) \
- ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31)); \
- return select(ab_x2_high32, INT_MAX, overflow); \
- }
-
-/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0).
- *
- * @param[in] size Size of vector.
- *
- * @return Result in fixed-point format Q0.
- */
-#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) \
- asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) \
- a) \
- { \
- const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \
- const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \
- const int k_fractional_bits = 31; \
- VEC_DATA_TYPE(int, size) \
- x = a + (1 << (k_fractional_bits - 3)); \
- VEC_DATA_TYPE(int, size) \
- x2 = ASYMM_MULT(x, x, size); \
- VEC_DATA_TYPE(int, size) \
- x3 = ASYMM_MULT(x2, x, size); \
- VEC_DATA_TYPE(int, size) \
- x4 = ASYMM_MULT(x2, x2, size); \
- VEC_DATA_TYPE(int, size) \
- x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \
- VEC_DATA_TYPE(int, size) \
- x4_over_24_plus_x3_over_6_plus_x2 = \
- ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \
- VEC_DATA_TYPE(int, size) \
- x4_over_24_plus_x3_over_6_plus_x2_over_2 = \
- ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \
- return constant_term + \
- ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \
- }
-
-/** Each bit of the result is set to the corresponding bit of either then_val or
- * else_val depending on whether the corresponding bit of if_mask is set.
- * Equivalent to the VBSL instruction in ARM NEON.
- *
- * @param[in] size Size of vector.
- *
- * @returns Result contaning bits from @p then_val or from @p else_val depending on corresponding
- * bit in @p if_mask is set or not.
- */
-#define ASYMM_SELECT_USING_MASK_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, \
- VEC_DATA_TYPE(int, size) then_val, \
- VEC_DATA_TYPE(int, size) else_val) \
- { \
- return (if_mask & then_val) ^ (~if_mask & else_val); \
- }
-
-/** For each element of input vector, the corresponding bits of the result item are set
- * if the input item is zero.
- *
- * @param[in] size Size of vector.
- *
- * @returns Output vector with bits set when corresponding bit in @p a is zero.
- */
-#define ASYMM_MASK_IF_ZERO_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \
- { \
- const VEC_DATA_TYPE(int, size) all_zeros = 0; \
- const VEC_DATA_TYPE(int, size) all_ones = ~0; \
- return select(all_zeros, all_ones, a == 0); \
- }
-
-/** For each element of input vector, the corresponding bits of the result item are set
- * if the input item is non-zero.
- *
- * @param[in] size Size of vector.
- *
- * @returns Output vector with bits set when corresponding bit in @p a is non zero.
- */
-#define ASYMM_MASK_IF_NON_ZERO_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \
- { \
- const VEC_DATA_TYPE(int, size) all_zeros = 0; \
- const VEC_DATA_TYPE(int, size) all_ones = ~0; \
- return select(all_zeros, all_ones, a != 0); \
- }
-
-#define EXP_BARREL_SHIFTER_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size( \
- VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, \
- int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
- { \
- if (k_integer_bits > exponent) \
- { \
- const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0; \
- return ASYMM_SELECT_USING_MASK( \
- ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \
- ASYMM_MULT(result, fp_multiplier, size), result, size); \
- } \
- \
- return result; \
- }
-
-/** Calculates \f$ exp(x) \f$ for x < 0.
- *
- * @param[in] size Size of vector.
- *
- * @return Result in fixed-point format Q0.
- */
-#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) \
- asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \
- { \
- const int k_fractional_bits = 31 - k_integer_bits; \
- VEC_DATA_TYPE(int, size) \
- k_one_quarter = 1 << (k_fractional_bits - 2); \
- VEC_DATA_TYPE(int, size) \
- mask = k_one_quarter - 1; \
- VEC_DATA_TYPE(int, size) \
- a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter; \
- VEC_DATA_TYPE(int, size) \
- a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \
- VEC_DATA_TYPE(int, size) \
- result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL( \
- a_mod_quarter_minus_one_quarter_scaled, size); \
- VEC_DATA_TYPE(int, size) \
- remainder = a_mod_quarter_minus_one_quarter - a; \
- \
- result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, \
- remainder, size); \
- result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, \
- remainder, size); \
- result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, \
- remainder, size); \
- result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, \
- remainder, size); \
- result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, \
- remainder, size); \
- result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, \
- size); \
- result = \
- EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \
- \
- if (k_integer_bits > 5) \
- { \
- const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5)); \
- result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size); \
- } \
- \
- const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \
- return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size); \
- }
-
-/** Calculates the product of a integer value by a power of two, with either a positive exponent
- * (equivalent to an arithmetic left shift, saturating) or a negative exponent
- * (equivalent to an arithmetic right shift, rounding to nearest).
- *
- * @param[in] size Size of vector.
- *
- * @return Arithmetic left or right shift.
- */
-#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) \
- asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
- { \
- if (exponent < 0) \
- { \
- return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \
- } \
- \
- const VEC_DATA_TYPE(int, size) min = INT_MIN; \
- const VEC_DATA_TYPE(int, size) max = INT_MAX; \
- int threshold = ((1 << (31 - exponent)) - 1); \
- VEC_DATA_TYPE(int, size) \
- positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \
- VEC_DATA_TYPE(int, size) \
- negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \
- VEC_DATA_TYPE(int, size) \
- result = x << exponent; \
- result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \
- result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \
- return result; \
- }
-
-/** Calculates (a+b)/2, rounded to the nearest integer.
- * Equivalent to VRHADD in the ARM NEON instruction set.
- *
- * @param[in] size Size of vector.
- *
- * @return (a+b)/2, rounded to the nearest integer.
- */
-#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) \
- asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
- { \
- VEC_DATA_TYPE(long, size) \
- a64 = convert_long##size(a); \
- VEC_DATA_TYPE(long, size) \
- b64 = convert_long##size(b); \
- VEC_DATA_TYPE(long, size) \
- sum = a64 + b64; \
- const VEC_DATA_TYPE(long, size) one = 1; \
- const VEC_DATA_TYPE(long, size) minus_one = -1; \
- VEC_DATA_TYPE(long, size) \
- sign = select(minus_one, one, sum >= 0); \
- return convert_int##size((sum + sign) / 2); \
- }
-
-/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1).
- *
- * @param[in] size Size of vector.
- *
- * @return Result in fixed-point format Q0.
- */
-#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) \
- asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \
- { \
- const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \
- const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2); \
- VEC_DATA_TYPE(int, size) \
- half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size); \
- const VEC_DATA_TYPE(int, size) Q2_48_over_17 = 1515870810; \
- const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540; \
- VEC_DATA_TYPE(int, size) \
- x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size); \
- for (int i = 0; i < 3; i++) \
- { \
- VEC_DATA_TYPE(int, size) \
- half_denominator_times_x = ASYMM_MULT(half_denominator, x, size); \
- VEC_DATA_TYPE(int, size) \
- one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x; \
- VEC_DATA_TYPE(int, size) \
- tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size); \
- x = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size); \
- } \
- return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size); \
- }
-
-/** Considering the integer value as fixed-point, change the number of integer bits and update value
- * accordingly.
- *
- * @param[in] size Size of vector.
- *
- * @return Rescaled value.
- */
-#define ASYMM_RESCALE_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, \
- int src_integer_bits, int dst_integer_bits) \
- { \
- int exponent = src_integer_bits - dst_integer_bits; \
- return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \
- }
-
-#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) \
- asymm_rounding_divide_by_POW2_##size(x, exponent)
-#define ASYMM_MULT(a, b, size) asymm_mult##size(a, b)
-#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
- ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
-#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) \
- asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
-#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) \
- asymm_select_using_mask##size(if_mask, then_val, else_val)
-#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
-#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
-#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, \
- remainder, size) \
- exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, \
- remainder)
-#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) \
- asymm_exp_on_negative_values##size(a, k_integer_bits)
-#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) \
- asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
-#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) \
- asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
-#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
-#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \
- asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
-
-ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
-ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
-ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
-ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)
-
-ASYMM_MULT_IMPL(2)
-ASYMM_MULT_IMPL(4)
-ASYMM_MULT_IMPL(8)
-ASYMM_MULT_IMPL(16)
-
-ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2)
-ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4)
-ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8)
-ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16)
-
-ASYMM_SELECT_USING_MASK_IMPL(2)
-ASYMM_SELECT_USING_MASK_IMPL(4)
-ASYMM_SELECT_USING_MASK_IMPL(8)
-ASYMM_SELECT_USING_MASK_IMPL(16)
-
-ASYMM_MASK_IF_ZERO_IMPL(2)
-ASYMM_MASK_IF_ZERO_IMPL(4)
-ASYMM_MASK_IF_ZERO_IMPL(8)
-ASYMM_MASK_IF_ZERO_IMPL(16)
-
-ASYMM_MASK_IF_NON_ZERO_IMPL(2)
-ASYMM_MASK_IF_NON_ZERO_IMPL(4)
-ASYMM_MASK_IF_NON_ZERO_IMPL(8)
-ASYMM_MASK_IF_NON_ZERO_IMPL(16)
-
-EXP_BARREL_SHIFTER_IMPL(2)
-EXP_BARREL_SHIFTER_IMPL(4)
-EXP_BARREL_SHIFTER_IMPL(8)
-EXP_BARREL_SHIFTER_IMPL(16)
-
-ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2)
-ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4)
-ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8)
-ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16)
-
-ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2)
-ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4)
-ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8)
-ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16)
-
-ASYMM_ROUNDING_HALF_SUM_IMPL(2)
-ASYMM_ROUNDING_HALF_SUM_IMPL(4)
-ASYMM_ROUNDING_HALF_SUM_IMPL(8)
-ASYMM_ROUNDING_HALF_SUM_IMPL(16)
-
-ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2)
-ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4)
-ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8)
-ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16)
-
-ASYMM_RESCALE_IMPL(2)
-ASYMM_RESCALE_IMPL(4)
-ASYMM_RESCALE_IMPL(8)
-ASYMM_RESCALE_IMPL(16)
-
-#endif // ARM_COMPUTE_HELPERS_ASYMM_H \ No newline at end of file
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl
deleted file mode 100644
index e3aa463db..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#ifndef VEC_SIZE
-#define VEC_SIZE 1
-#endif
-
-#if defined(DATA_TYPE)
-/** Performs a negation of input tensor.
- *
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: S16/S32/F16/F32.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
- */
-__kernel void neg_tensor(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- VSTORE(VEC_SIZE)
- (-VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), 0, (__global DATA_TYPE *)output.ptr);
-}
-#endif // defined(DATA_TYPE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl
deleted file mode 100644
index ecf4696e9..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#if defined(IW) && defined(IH) && defined(ID) && defined(IB) && defined(DEPTH_OUT) && defined(ZERO_VALUE)
-/** Perform space to depth rearrangement of tensor
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
- * @attention Input dimensions should be passed as a preprocessor argument using -DIW(width), -DIH(height), -DID(depth) and -DIB(batch). e.g. -DIW = 4
- * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- *
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p inpu
-t_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in
-bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- *
- * @param[in] pad_values Padding values for each of the dimensions. Only pad values for Up(for
- * batch), Top(for height), Left(for width) and Front(for depth) are
- * required. Supported data type: S32
- */
-
-__kernel void pad(
- TENSOR4D_DECLARATION(input),
- TENSOR4D_DECLARATION(output),
- const int4 pad_values)
- {
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
-
- int index[4]={0};
-
- index[0] = get_global_id(0);//W
- index[1] = get_global_id(1);//H
- index[2] = get_global_id(2) % DEPTH_OUT;//C
- index[3] = get_global_id(2) / DEPTH_OUT;//N
-
- if (index[0] < pad_values.x || index[0] >= (IW + pad_values.x) ||
- index[1] < pad_values.y || index[1] >= (IH + pad_values.y) ||
- index[2] < pad_values.z || index[2] >= (ID + pad_values.z) ||
- index[3] < pad_values.w || index[3] >= (IB + pad_values.w))
- {
- *((__global DATA_TYPE *)out.ptr) = (DATA_TYPE)ZERO_VALUE;
- }
- else
- {
- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)
- tensor4D_offset(&in, index[0] - pad_values.x,
- index[1] - pad_values.y,
- index[2] - pad_values.z,
- index[3] - pad_values.w));
- }
- }
-
-#endif //if defined(IW) && defined(IH) && defined(ID) && defined(IB) && defined(DEPTH_OUT) && defined(ZERO_VALUE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl
deleted file mode 100644
index 7cc8b0354..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4)
-/** Perform a Generic permute operation on an input tensor of Shape DCHW.
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
- * @attention Permutation vector is passed as a preprocessor arguement using -DP1, -DP2, -DP3 and -DP4=int, e.g. -DP1=2
- *
- * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U1
-6/S16/F16/U32/S32/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in b
-ytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in b
-ytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in b
-ytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p inpu
-t_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in
-bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void permute_generic(
- TENSOR4D_DECLARATION(input),
- TENSOR4D_DECLARATION(output))
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
-
- int out_index[4];
- int in_index[4];
- in_index[0] = get_global_id(0);//W
- in_index[1] = get_global_id(1);//H
- in_index[2] = get_global_id(2) % DEPTH_IN;//C
- in_index[3] = get_global_id(2) / DEPTH_IN;//B
- out_index[0] = in_index[P1];
- out_index[1] = in_index[P2];
- out_index[2] = in_index[P3];
- out_index[3] = in_index[P4];
-
- *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0],out_index[1],out_index[2],out_index[3])) = *((__global DATA_TYPE *)in.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl
deleted file mode 100644
index aa05121b1..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#ifdef SATURATE
-#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x))
-#else /* SATURATE */
-#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x))
-#endif /* SATURATE */
-#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round)
-
-/** Performs a pixelwise division with float scale of either integer or float inputs.
- *
- * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short
- * @attention The data type of the intermediate result of the division should passed as well using -DDATA_TYPE_RES.
- * e.g. If one of inputs is S16 -DDATA_TYPE_RES=int should be passed else -DDATA_TYPE_RES=short.
- * @attention -DDATA_TYPE_FLOAT must be passed if floating point inputs are provided.
- *
- * @param[in] in1_ptr Pointer to the source image. Supported data types: U8, S16, F16, F32
- * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in1_stride_z Stride of the source image in Y dimension (in bytes)
- * @param[in] in1_step_z in1_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16, F16, F32
- * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in2_stride_z Stride of the source image in Y dimension (in bytes)
- * @param[in] in2_step_z in2_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16, F16, F32
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_stride_z Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_z out_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] scale Float scaling factor. Supported data types: F32
- */
-__kernel void pixelwise_div_float(
- TENSOR3D_DECLARATION(in1),
- TENSOR3D_DECLARATION(in2),
- TENSOR3D_DECLARATION(out),
- const float scale)
-{
- // Get pixels pointer
- Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
- Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
-
- // Load data
- VEC_DATA_TYPE(DATA_TYPE_RES, 16)
- in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
- VEC_DATA_TYPE(DATA_TYPE_RES, 16)
- in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
-
- // Perform division
-#ifdef DATA_TYPE_FLOAT
- VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
- res = CONVERT(in1_data / in2_data * (DATA_TYPE_RES)scale, VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
-#else /* DATA_TYPE_FLOAT */
- VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
- res = CONVERT_OP_FLOAT(CONVERT_OP_FLOAT((convert_float16(in1_data / in2_data) * scale), VEC_DATA_TYPE(DATA_TYPE_RES, 16), ROUND), VEC_DATA_TYPE(DATA_TYPE_OUT, 16), ROUND);
-#endif /* DATA_TYPE_FLOAT */
-
- // Store result
- vstore16(res, 0, (__global DATA_TYPE_OUT *)out.ptr);
-}
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl
deleted file mode 100644
index fdfb78003..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#if defined(SATURATE)
-#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size##_sat(x))
-#else // SATURATE
-#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size(x))
-#endif // SATURATE
-#define CONVERT_OP_INT(x, type, size) CONVERT_OP_INT_STR(x, type, size)
-
-#define DIV_OP(x, y, scale, type, size) CONVERT_OP_INT((x) / (y) >> scale, type, size)
-
-/** Performs a pixelwise division with integer scale of integer inputs.
- *
- * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short
- * @attention The data_type of the intermediate result of the division should passed as well using -DDATA_TYPE_RES.
- * e.g. If one of inputs is S16 -DDATA_TYPE_RES=int should be passed else -DDATA_TYPE_RES=short.
- *
- * @param[in] in1_ptr Pointer to the source image. Supported data types: U8/S16
- * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in1_stride_z Stride of the source image in Y dimension (in bytes)
- * @param[in] in1_step_z in1_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] in2_ptr Pointer to the source image. Supported data types: same as @p in1_ptr
- * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in2_stride_z Stride of the source image in Y dimension (in bytes)
- * @param[in] in2_step_z in2_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in1_ptr
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_stride_z Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_z out_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] scale Integer scaling factor. Supported data types: S32
- */
-__kernel void pixelwise_div_int(
- TENSOR3D_DECLARATION(in1),
- TENSOR3D_DECLARATION(in2),
- TENSOR3D_DECLARATION(out),
- const uint scale)
-{
- // Get pixels pointer
- Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
- Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
-
- // Load data
- VEC_DATA_TYPE(DATA_TYPE_RES, 16)
- in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
- VEC_DATA_TYPE(DATA_TYPE_RES, 16)
- in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
-
- // Perform division and store result
- vstore16(DIV_OP(in1_data, in2_data, scale, DATA_TYPE_OUT, 16), 0, (__global DATA_TYPE_OUT *)out.ptr);
-}
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
deleted file mode 100644
index ab1307e64..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers_asymm.h"
-
-#ifdef SATURATE
-#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x))
-#else /* SATURATE */
-#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x))
-#endif /* SATURATE */
-#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round)
-
-#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
-/** Performs a pixelwise multiplication used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
- *
- * The following computations will be performed by the kernel:
- *
- * -# Add offset terms to inputs
- * -# Multiply inputs
- * -# Add offset terms to final result
- * -# Multiply each entry of result by result_mult_int
- * -# Shift the int32 accumulator by result_shift
- * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
- *
- * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
- * @attention The offset factor of inputs must be passed at compile time using -DIN1_OFFSET and -DIN2_OFFSET
- * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT
- *
- * @param[in] in1_ptr Pointer to the source image. Supported data types: U8
- * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in1_stride_z Stride of the source image in Y dimension (in bytes)
- * @param[in] in1_step_z in1_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] in2_ptr Pointer to the source image. Supported data types: U8
- * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in2_stride_z Stride of the source image in Y dimension (in bytes)
- * @param[in] in2_step_z in2_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_stride_z Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_z out_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] scale Float scaling factor. Supported data types: F32
- */
-__kernel void pixelwise_mul_qasymm8(
- TENSOR3D_DECLARATION(in1),
- TENSOR3D_DECLARATION(in2),
- TENSOR3D_DECLARATION(out),
- const float scale)
-{
- // Get pixels pointer
- Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
- Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
-
- // Load data
- VEC_DATA_TYPE(int, 16)
- in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16));
- VEC_DATA_TYPE(int, 16)
- in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16));
-
- // Perform multiplication of two inputs
- VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET);
- VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET);
- VEC_DATA_TYPE(int, 16) out_val = in1_val * in2_val;
-
- // Multiply with a multiplier smaller than 1
- out_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(out_val, RESULT_MULT_INT, RESULT_SHIFT, 16);
- out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET);
-
- VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16));
-
-// TODO: Apply min-max BOUND to support fuse with relu.
-/*
-#if defined(MIN_BOUND)
- res = max(res, (uchar16)MIN_BOUND);
-#endif // defined(MIN_BOUND)
-#if defined(MAX_BOUND)
- res = min(res, (uchar16)MAX_BOUND);
-#endif // defined(MAX_BOUND)
-*/
-
- // Store result
- VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)),
- 0, (__global DATA_TYPE_OUT *)out.ptr);
-}
-#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl
deleted file mode 100644
index 68da2ba32..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#ifndef VEC_SIZE
-#define VEC_SIZE 1
-#endif
-
-#if defined(DATA_TYPE)
-/** Returns result of prelu function implemented as below:
- * f(input) = alpha * input for input < 0, f(input) = input for input >= 0.
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Can only take floating point data types.
- *
- * @param[in] input1_ptr Pointer to the source image. Supported Data types : F16/F32
- * @param[in] input1_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input1_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source image
- *
- * @param[in] alpha_ptr Pointer to the source image. Supported Data types : F16/F32
- * @param[in] alpha_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] alpha_step_x input2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] alpha_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] alpha_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] alpha_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source image
- *
- * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void prelu(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(alpha),
- TENSOR3D_DECLARATION(output))
-{
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- VSTORE(VEC_SIZE)
- (VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) < 0 ?
- VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) * VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)alpha.ptr) :
- VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr),
- 0, (__global DATA_TYPE *)output.ptr);
-
-}
-#endif // defined(DATA_TYPE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl
deleted file mode 100644
index 7e97b7ed6..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-#define SUB(x, y) (x) - (y)
-
-#if defined(OFF_IN1) && defined(OFF_IN2) && defined(OFF_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(VEC_SIZE)
-
-#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
-#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
-#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE)
-#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
-#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
-
-/** Returns result of prelu function implemented as below:
- * f(input) = alpha * input for input < 0, f(input) = input for input >= 0.
- *
- * @attention Data type can be passed using the -DDATA_TYPE_IN compile flag, e.g. -DDATA_TYPE_IN=uchar
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Can only take uchar data types.
- *
- * @param[in] input1_ptr Pointer to the source image. Supported Data types : QASYMM8
- * @param[in] input1_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input1_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source image
- *
- * @param[in] alpha_ptr Pointer to the source image. Supported Data types : QASYMM8
- * @param[in] alpha_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] alpha_step_x input2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] alpha_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] alpha_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] alpha_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source image
- *
- * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void prelu_qasymm8(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(alpha),
- TENSOR3D_DECLARATION(output))
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)input.ptr), VEC_INT);
- VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)alpha.ptr), VEC_INT);
-
- in_a = SUB(in_a, (VEC_INT)((int)OFF_IN1));
- in_b = SUB(in_b, (VEC_INT)((int)OFF_IN2));
-
- const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1);
- const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2);
- const VEC_FLOAT outf32 = in1f32 < 0 ? in1f32 * in2f32 : in1f32;
- const VEC_FLOAT qresf32 = outf32 / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFF_OUT));
- const VEC_UCHAR res = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_UCHAR);
-
- VSTORE(VEC_SIZE)
- (res, 0, (__global uchar *)output.ptr);
-}
-
-#endif // defined(OFF_IN1) && defined(OFF_IN2) && defined(OFF_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(VEC_SIZE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl
deleted file mode 100644
index 8bef49363..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
-/** Perform reduce max/min
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
- * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using
- * -DOP_CODE = number. e.g. -DOP_CODE=1
- *
- * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] axis Axis through which reduction occurs
- * @param[in] dim Dimension across the axis to be reduced.
- */
-__kernel void reduce_min_max(TENSOR4D_DECLARATION(input),
- TENSOR4D_DECLARATION(output),
- const int axis,
- const int dim)
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
-
- int indices[4] =
- {
- get_global_id(0),
- get_global_id(1),
- get_global_id(2) % DEPTH_OUT,
- get_global_id(2) / DEPTH_OUT,
- };
-
- DATA_TYPE value = *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
- for(int i = 1; i < dim; ++i)
- {
- indices[axis] = i;
-
- #if OP_CODE == 1 // REDUCE_MAX
- value = max(value, *((__global DATA_TYPE *)
- tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])));
-
- #elif OP_CODE == 2 // REDUCE_MIN
- value = min(value, *((__global DATA_TYPE *)
- tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])));
-
- #else // OP NOT SUPPORTED
- return;
-
- #endif
- }
-
- *((__global DATA_TYPE *)out.ptr) = value;
-}
-
-/** Perform reduce sum/mean
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
- * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using
- * -DOP_CODE = number. e.g. -DOP_CODE=1
- *
- * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] axis Axis through which reduction occurs
- * @param[in] dim Dimension across the axis to be reduced.
- */
-__kernel void reduce_sum_mean(TENSOR4D_DECLARATION(input),
- TENSOR4D_DECLARATION(output),
- const int axis,
- const int dim)
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
-
- int indices[4] =
- {
- get_global_id(0),
- get_global_id(1),
- get_global_id(2) % DEPTH_OUT,
- get_global_id(2) / DEPTH_OUT,
- };
-
- DATA_TYPE sum_value = (DATA_TYPE)0;
- for(int i = 0; i < dim; ++i)
- {
- indices[axis] = i;
- sum_value += *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
- }
-
- #if OP_CODE == 3 // REDUCE_SUM
- *((__global DATA_TYPE *)out.ptr) = sum_value;
-
- #elif OP_CODE == 4 // REDUCE_MEAN
- *((__global DATA_TYPE *)out.ptr) = sum_value / CONVERT(dim, DATA_TYPE);
-
- #else // OP NOT SUPPORTED
- return;
-
- #endif
-}
-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl
deleted file mode 100644
index a0fc2d5a9..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE)
-/** Perform space to batch with input of 4D and NCHW format
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
- * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size. e.g. -DBATCH_IN=16
- * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DHEIGHT_IN=16
- * @attention Input tensor width should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DWIDTH_IN=16
- * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] block_size_ptr Pointer to the source tensor. Supported data types: S32
- * @param[in] block_size_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] block_size_step_x block_size_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] block_size_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] padding_size_ptr Pointer to the source tensor. Supported data types: S32
- * @param[in] padding_size_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] padding_size_step_x padding_size_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] padding_size_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] padding_size_step_y padding_size_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] padding_size_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void space_to_batch_4d_nchw(TENSOR4D_DECLARATION(input),
- TENSOR4D_DECLARATION(output),
- VECTOR_DECLARATION(block_size),
- IMAGE_DECLARATION(padding_size))
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
-
- int block_size_x = *((__global int *)(block_size_ptr));
- int block_size_y = *((__global int *)(block_size_ptr + block_size_stride_x));
- int shift_x = (get_global_id(2) / DEPTH_OUT / BATCH_IN) % block_size_x;
- int shift_y = (get_global_id(2) / DEPTH_OUT / BATCH_IN) / block_size_x;
-
- int in_index[4] = {0, };
- in_index[0] = get_global_id(0) * block_size_x + shift_x - *((__global int *)(padding_size_ptr));
- in_index[1] = get_global_id(1) * block_size_y + shift_y - *((__global int *)(padding_size_ptr + padding_size_stride_y));
- in_index[2] = get_global_id(2) % DEPTH_OUT;
- in_index[3] = (get_global_id(2) / DEPTH_OUT) % BATCH_IN;
-
- if (in_index[0] < 0 || in_index[0] >= WIDTH_IN || in_index[1] < 0 || in_index[1] >= HEIGHT_IN)
- {
- *((__global DATA_TYPE *)out.ptr) = (DATA_TYPE)ZERO_VALUE;
- }
- else
- {
- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3]));
- }
-}
-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE)
-
-#if defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE)
-/** Perform space to batch with input of 4D and NHWC format
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Output tensor depth should be given as a preprocessor argument using -DHEIGHT_OUT=size. e.g. -DHEIGHT_OUT=16
- * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size. e.g. -DBATCH_IN=16
- * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DHEIGHT_IN=16
- * @attention Input tensor width should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DWIDTH_IN=16
- * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] block_size_ptr Pointer to the source tensor. Supported data types: S32
- * @param[in] block_size_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] block_size_step_x block_size_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] block_size_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] padding_size_ptr Pointer to the source tensor. Supported data types: S32
- * @param[in] padding_size_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] padding_size_step_x padding_size_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] padding_size_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] padding_size_step_y padding_size_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] padding_size_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void space_to_batch_4d_nhwc(TENSOR4D_DECLARATION(input),
- TENSOR4D_DECLARATION(output),
- VECTOR_DECLARATION(block_size),
- IMAGE_DECLARATION(padding_size))
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, HEIGHT_OUT);
-
- int block_size_x = *((__global int *)(block_size_ptr));
- int block_size_y = *((__global int *)(block_size_ptr + block_size_stride_x));
- int shift_x = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) % block_size_x;
- int shift_y = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) / block_size_x;
-
- int in_index[4] = {0, };
- in_index[0] = get_global_id(0) * VEC_SIZE;
- in_index[1] = get_global_id(1) * block_size_x + shift_x - *((__global int *)(padding_size_ptr));
- in_index[2] = get_global_id(2) % HEIGHT_OUT * block_size_y + shift_y - *((__global int *)(padding_size_ptr + padding_size_stride_y));
- in_index[3] = (get_global_id(2) / HEIGHT_OUT) % BATCH_IN;
-
- if (in_index[1] < 0 || in_index[1] >= WIDTH_IN || in_index[2] < 0 || in_index[2] >= HEIGHT_IN)
- {
- VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))ZERO_VALUE, 0, (__global DATA_TYPE *)out.ptr);
- }
- else
- {
- VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3])),
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
- 0, (__global DATA_TYPE *)out.ptr);
- }
-}
-
-#endif // defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl
deleted file mode 100644
index f6977045a..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE)
-/** Perform space to depth rearrangement of tensor
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. -DBLOCK_SIZE=1
- *
- * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p inpu
-t_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in
-bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void space_to_depth(
- TENSOR4D_DECLARATION(input),
- TENSOR4D_DECLARATION(output))
- {
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
-
- int out_index[4]={0};
- int in_index[4]={0};
-
- in_index[0] = get_global_id(0);//W
- in_index[1] = get_global_id(1);//H
- in_index[2] = get_global_id(2) % DEPTH_IN;//C
- in_index[3] = get_global_id(2) / DEPTH_IN;//B
-
- out_index[0] = in_index[0]/BLOCK_SIZE;
- out_index[1] = in_index[1]/BLOCK_SIZE;
- out_index[2] = in_index[2] + ((in_index[1] % BLOCK_SIZE) * BLOCK_SIZE + in_index[0] % BLOCK_SIZE) * DEPTH_IN;
- out_index[3] = in_index[3];
-
- *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0],out_index[1],out_index[2],out_index[3])) = *((__global DATA_TYPE *)in.ptr);
- }
-#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl
deleted file mode 100644
index 3e1a5c97f..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#ifndef VEC_SIZE
-#define VEC_SIZE 1
-#endif
-
-#if defined(DATA_TYPE)
-/** Returns true value of squared_difference of two tensors.
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Can only take floating point data types.
- *
- * @param[in] input1_ptr Pointer to the source image. Supported data types: F16/F32
- * @param[in] input1_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input1_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source image
- *
- * @param[in] input2_ptr Pointer to the source image. Supported data types: F16/F32
- * @param[in] input2_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input2_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input2_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input2_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input2_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source image
- *
- * @param[out] output_ptr Pointer to the destination image. Supported data types: F16/F32
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void squared_difference(
- TENSOR3D_DECLARATION(input1),
- TENSOR3D_DECLARATION(input2),
- TENSOR3D_DECLARATION(output))
-{
- Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1);
- Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- diff = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr)- VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr);
-
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- sq_diff = diff * diff;
-
- VSTORE(VEC_SIZE)
- (sq_diff, 0, (__global DATA_TYPE *)output.ptr);
-}
-#endif // defined(DATA_TYPE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice_ex.cl
deleted file mode 100644
index b39c55b96..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice_ex.cl
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#if defined(ELEMENT_DATA_TYPE) && defined(DEPTH_OUT)
-/** Extracts a strided slice up to 4-dimensions
- *
- * @note Datatype should be given as a preprocessor argument using -DELEMENT_DATA_TYPE=type. e.g. -DELEMENT_DATA_TYPE=short
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
- *
- * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] starts The stride of X dimension of input tensor to be sliced. Supported data types: S32
- * @param[in] strides The stride of Y dimension of input tensor to be sliced. Supported data types: S32
- */
-__kernel void strided_slice_ex(TENSOR4D_DECLARATION(input),
- TENSOR4D_DECLARATION(output),
- const int4 starts,
- const int4 strides)
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
-
- int4 indices_in =
- {
- starts.x + (strides.x * get_global_id(0)),
- starts.y + (strides.y * get_global_id(1)),
- starts.z + (strides.z * (get_global_id(2) % DEPTH_OUT)),
- starts.w + (strides.w * (get_global_id(2) / DEPTH_OUT)),
- };
- *((__global ELEMENT_DATA_TYPE *)out.ptr) = *((__global ELEMENT_DATA_TYPE *)tensor4D_offset(&in, indices_in.x, indices_in.y, indices_in.z, indices_in.w));
-}
-#endif // defined(ELEMENT_DATA_TYPE) && defined(DEPTH_OUT)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl
deleted file mode 100644
index d97f23a47..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "helpers.h"
-
-__kernel void topkv2_init(VECTOR_DECLARATION(input),
- __global float* in_key_buf,
- __global int* in_ind_buf,
- const int n)
-{
- int gid = get_global_id(0);
- int lws = get_local_size(0);
- int groups = get_num_groups(0);
- int gws = lws * groups;
- int iter = n / gws;
-
- Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input);
-
- for(int i = 0; i < iter; ++i)
- {
- int idx = i * gws + gid;
- in_key_buf[idx] = *(__global float*)(input.ptr + idx * input.stride_x);
- in_ind_buf[idx] = idx;
- }
-}
-
-__kernel void topkv2_find_first_negative(
- __global float *out_key_buf,
- __global int *first_negative_idx,
- int n)
-{
- int gid = get_global_id(0);
-
- if( gid == n - 1 )
- {
- // if the last item is positive, the first negative index is n.
- if( out_key_buf[gid] > 0.f )
- *first_negative_idx = n;
- } else if ( gid == 0 ) {
- // if the first item is negative, set it 0.
- if( out_key_buf[gid] < 0.f )
- *first_negative_idx = 0;
- } else {
- // if its left is positive and it is negative, then it is the first negative item.
- if( out_key_buf[gid-1] > 0.f && out_key_buf[gid] < 0.f )
- *first_negative_idx = gid;
- }
-}
-
-__kernel void topkv2_reorder_negatives(
- __global float* in_key_buf,
- __global float* out_key_buf,
- __global float* in_ind_buf,
- __global float* out_ind_buf,
- __global int* first_negative_idx,
- int n)
-{
- int gid = get_global_id(0);
-
- int num_negs = n - *first_negative_idx;
- int in_idx;
-
- if( gid < num_negs ) {
- in_idx = n - 1 - gid;
- } else {
- in_idx = gid - num_negs;
- }
-
- out_key_buf[gid] = in_key_buf[in_idx];
- out_ind_buf[gid] = in_ind_buf[in_idx];
-}
-
-__kernel void topkv2_store(
- VECTOR_DECLARATION(values),
- VECTOR_DECLARATION(indices),
- __global float *out_key_buf,
- __global int *out_ind_buf,
- int n)
-{
- int gid = get_global_id(0);
-
- Vector values = CONVERT_TO_VECTOR_STRUCT_NO_STEP(values);
- Vector indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(indices);
-
- int idx = n - 1 - gid;
-
- *(__global float*)(values.ptr + gid * values.stride_x) = out_key_buf[idx];
- *(__global int*)(indices.ptr + gid * indices.stride_x) = out_ind_buf[idx];
-}
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl
deleted file mode 100644
index 0292fab04..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "helpers.h"
-
-__global inline float* get_vec_elem(Vector* vec, int idx)
-{
- return (__global float*)(vec->ptr + idx * vec->stride_x);
-}
-
-__global inline int* get_vec_elem_int(Vector* vec, int idx)
-{
- return (__global int*)(vec->ptr + idx * vec->stride_x);
-}
-
-// A utility function to swap two elements
-void swap(__global float *a, __global float *b)
-{
- float t = *a;
- *a = *b;
- *b = t;
-}
-
-void swap_idx(__global int *a, __global int *b)
-{
- int t = *a;
- *a = *b;
- *b = t;
-}
-
-/* This function is same in both iterative and recursive*/
-int partition (Vector* arr, __global int* indices, int l, int h)
-{
- float x = *get_vec_elem(arr, h);
- int i = (l - 1);
-
- for (int j = l; j <= h- 1; j++)
- {
- if (*get_vec_elem(arr, j) >= x)
- {
- i++;
- swap (get_vec_elem(arr,i), get_vec_elem(arr,j));
- swap_idx(&indices[i], &indices[j]);
- }
- }
- swap (get_vec_elem(arr, i + 1), get_vec_elem(arr, h));
- swap_idx(&indices[i + 1], &indices[h]);
- return (i + 1);
-}
-
-/* A[] --> Array to be sorted,
- l --> Starting index,
- h --> Ending index */
-void quickSortIterative (Vector* arr, __global int* indices,
- __global int *stack, int l, int h)
-{
- // Create an auxiliary stack
-
- // initialize top of stack
- int top = -1;
-
- // push initial values of l and h to stack
- stack[ ++top ] = l;
- stack[ ++top ] = h;
-
- // Keep popping from stack while is not empty
- while ( top >= 0 )
- {
- // Pop h and l
- h = stack[ top-- ];
- l = stack[ top-- ];
-
- // Set pivot element at its correct position
- // in sorted array
- int p = partition( arr, indices, l, h );
-
- // If there are elements on left side of pivot,
- // then push left side to stack
- if ( p-1 > l )
- {
- stack[ ++top ] = l;
- stack[ ++top ] = p - 1;
- }
-
- // If there are elements on right side of pivot,
- // then push right side to stack
- if ( p+1 < h )
- {
- stack[ ++top ] = p + 1;
- stack[ ++top ] = h;
- }
- }
-}
-
-__kernel void topkv2_quicksort(VECTOR_DECLARATION(input),
- VECTOR_DECLARATION(topk_values), VECTOR_DECLARATION(topk_indices),
- __global int* indices, __global int* temp_stack, int k, int n)
-{
- Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input);
- Vector topk_values = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_values);
- Vector topk_indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_indices);
-
- for( int i = 0; i < n; ++i )
- {
- indices[i] = i;
- }
-
- quickSortIterative(&input, indices, temp_stack, 0, n-1);
-
- // extract k items.
- for(int i = 0; i < k; ++i)
- {
- *get_vec_elem(&topk_values, i) = *get_vec_elem(&input, i);
- *get_vec_elem_int(&topk_indices, i) = indices[i];
- }
-}
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
deleted file mode 100644
index c2c2d89a4..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
+++ /dev/null
@@ -1,271 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// reference:
-// https://code.google.com/archive/p/ocl-radix-sort/source/default/source
-// OpenCL kernel sources for the CLRadixSort class
-// the #include does not exist in OpenCL
-// Copyright Philippe Helluy, Université de Strasbourg, France, 2011, helluy@math.unistra.fr
-// licensed under the GNU Lesser General Public License see http://www.gnu.org/copyleft/lesser.html
-// if you find this software usefull you can cite the following work in your reports or articles:
-// Philippe HELLUY, A portable implementation of the radix sort algorithm in OpenCL, 2011.
-// http://hal.archives-ouvertes.fr/hal-00596730
-
-// Reference for floating point radix sort:
-// http://www.codercorner.com/RadixSortRevisited.htm
-
-// compute the histogram for each radix and each virtual processor for the pass
-__kernel void radixsort_histogram(__global float* in_key_buf,
- __global int* d_Histograms,
- const int pass,
- __local int* loc_histo,
- const int n)
-{
- int it = get_local_id(0); // i local number of the processor
- int ig = get_global_id(0); // global number = i + g I
-
- int gr = get_group_id(0); // g group number
-
- int groups = get_num_groups(0);
- int items = get_local_size(0);
-
- // set the local histograms to zero
- for(int ir=0;ir<_RADIX;ir++){
- loc_histo[ir * items + it] = 0;
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // range of keys that are analyzed by the work item
- int size= n/groups/items; // size of the sub-list
- int start= ig * size; // beginning of the sub-list
-
- unsigned int key;
- int shortkey,k;
-
- // compute the index
- // the computation depends on the transposition
- for(int j = 0; j < size ; j++) {
-#ifdef TRANSPOSE
- k= groups * items * j + ig;
-#else
- k=j+start;
-#endif
-
- key = *((__global unsigned int*)(in_key_buf + k));
-
- // extract the group of _BITS bits of the pass
- // the result is in the range 0.._RADIX-1
- shortkey=(( key >> (pass * _BITS)) & (_RADIX-1));
-
- // increment the local histogram
- loc_histo[shortkey * items + it ]++;
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // copy the local histogram to the global one
- for(int ir=0;ir<_RADIX;ir++) {
- d_Histograms[items * (ir * groups + gr) + it] = loc_histo[ir * items + it];
- }
-
- barrier(CLK_GLOBAL_MEM_FENCE);
-}
-
-// initial transpose of the list for improving
-// coalescent memory access
-__kernel void transpose(const __global int* invect,
- __global int* outvect,
- const int nbcol,
- const int nbrow,
- const __global int* inperm,
- __global int* outperm,
- __local int* blockmat,
- __local int* blockperm,
- const int tilesize){
-
- int i0 = get_global_id(0)*tilesize; // first row index
- int j = get_global_id(1); // column index
-
- int jloc = get_local_id(1); // local column index
-
- // fill the cache
- for(int iloc=0;iloc<tilesize;iloc++){
- int k=(i0+iloc)*nbcol+j; // position in the matrix
- blockmat[iloc*tilesize+jloc]=invect[k];
-#ifdef PERMUT
- blockperm[iloc*tilesize+jloc]=inperm[k];
-#endif
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // first row index in the transpose
- int j0=get_group_id(1)*tilesize;
-
- // put the cache at the good place
- for(int iloc=0;iloc<tilesize;iloc++){
- int kt=(j0+iloc)*nbrow+i0+jloc; // position in the transpose
- outvect[kt]=blockmat[jloc*tilesize+iloc];
-#ifdef PERMUT
- outperm[kt]=blockperm[jloc*tilesize+iloc];
-#endif
- }
-
-}
-
-// each virtual processor reorders its data using the scanned histogram
-__kernel void radixsort_reorder(__global float* in_key,
- __global float* out_key,
- __global int* d_Histograms,
- const int pass,
- __global int* indices_in,
- __global int* indices_out,
- __local int* loc_histo,
- const int n){
-
- int it = get_local_id(0);
- int ig = get_global_id(0);
-
- int gr = get_group_id(0);
- int groups=get_num_groups(0);
- int items=get_local_size(0);
-
- int start= ig *(n/groups/items);
- int size= n/groups/items;
-
- // take the histogram in the cache
- for(int ir=0;ir<_RADIX;ir++){
- loc_histo[ir * items + it]=
- d_Histograms[items * (ir * groups + gr) + it];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- int newpos,shortkey,k,newpost;
- unsigned int key;
-
- for(int j= 0; j< size;j++){
-#ifdef TRANSPOSE
- k= groups * items * j + ig;
-#else
- k=j+start;
-#endif
- float org_value = in_key[k];
- key = *(__global unsigned int*)(in_key + k);
- shortkey=((key >> (pass * _BITS)) & (_RADIX-1));
-
- newpos=loc_histo[shortkey * items + it];
-
-#ifdef TRANSPOSE
- int ignew,jnew;
- ignew= newpos/(n/groups/items);
- jnew = newpos%(n/groups/items);
- newpost = jnew * (groups*items) + ignew;
-#else
- newpost=newpos;
-#endif
-
- //d_outKeys[newpost]= key; // killing line !!!
- out_key[newpost] = org_value;
-
-#ifdef PERMUT
- indices_out[newpost] = indices_in[k];
-#endif
-
- newpos++;
- loc_histo[shortkey * items + it]=newpos;
- }
-}
-
-// perform a parallel prefix sum (a scan) on the local histograms
-// (see Blelloch 1990) each workitem worries about two memories
-// see also http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html
-__kernel void radixsort_scanhistograms(__global int* histo, __local int* temp, __global int* globsum)
-{
- int it = get_local_id(0);
- int ig = get_global_id(0);
- int decale = 1;
- int n=get_local_size(0) * 2 ;
- int gr=get_group_id(0);
-
- // load input into local memory
- // up sweep phase
- temp[2*it] = histo[2*ig];
- temp[2*it+1] = histo[2*ig+1];
-
- // parallel prefix sum (algorithm of Blelloch 1990)
- for (int d = n>>1; d > 0; d >>= 1){
- barrier(CLK_LOCAL_MEM_FENCE);
- if (it < d){
- int ai = decale*(2*it+1)-1;
- int bi = decale*(2*it+2)-1;
- temp[bi] += temp[ai];
- }
- decale *= 2;
- }
-
- // store the last element in the global sum vector
- // (maybe used in the next step for constructing the global scan)
- // clear the last element
- if (it == 0) {
- globsum[gr]=temp[n-1];
- temp[n - 1] = 0;
- }
-
- // down sweep phase
- for (int d = 1; d < n; d *= 2){
- decale >>= 1;
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (it < d){
- int ai = decale*(2*it+1)-1;
- int bi = decale*(2*it+2)-1;
-
- int t = temp[ai];
- temp[ai] = temp[bi];
- temp[bi] += t;
- }
-
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // write results to device memory
-
- histo[2*ig] = temp[2*it];
- histo[2*ig+1] = temp[2*it+1];
-
- barrier(CLK_GLOBAL_MEM_FENCE);
-
-}
-
-// use the global sum for updating the local histograms
-// each work item updates two values
-__kernel void radixsort_pastehistograms( __global int* histo,__global int* globsum)
-{
- int ig = get_global_id(0);
- int gr=get_group_id(0);
-
- int s;
-
- s=globsum[gr];
-
- // write results to device memory
- histo[2*ig] += s;
- histo[2*ig+1] += s;
-
- barrier(CLK_GLOBAL_MEM_FENCE);
-}