diff options
Diffstat (limited to 'libs/ARMComputeEx/src/core/CL/cl_kernels')
32 files changed, 0 insertions, 3689 deletions
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl deleted file mode 100644 index f54c7bde3..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - -#define CONST_ONE 1.f -#define DIV_OP(a, b) ((a) / (b)) -#define RSQRT_OP(a) DIV_OP(CONST_ONE, sqrt((a))) - -// Inverse Square-root Activation -inline TYPE rsqrt_op(TYPE x) -{ - return RSQRT_OP(x); -} - -#define ACTIVATION_OP2(op, x) op##_op(x) -#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x) - -#if defined(ACT) - -/** This performs an activation function floating point inputs. - * - * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH - * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively. - * - * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void activation_layer_ex( - TENSOR3D_DECLARATION(input) -#ifndef IN_PLACE - , - TENSOR3D_DECLARATION(output) -#endif /* not IN_PLACE */ -) -{ - // Get pixels pointer - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); -#ifdef IN_PLACE - Tensor3D output = input; -#else /* IN_PLACE */ - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); -#endif /* IN_PLACE */ - - // Load data - TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr); - - // Perform activation - data = ACTIVATION_OP(ACT, data); - - // Store result - VSTORE(VEC_SIZE) - (data, 0, (__global DATA_TYPE *)output.ptr); -} - -#endif /* defined(ACT) */ diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl deleted file mode 100644 index 9a6921d7c..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) -/** Perform arg_max/arg_min - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 - * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using - * -DOP_CODE = number. e.g. -DOP_CODE=1 - * - * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[out] output_ptr Pointer to the destination image. Supported data types: U32 - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] axis Axis through which reduction occurs for max value index - * @param[in] dim Dimension across the axis to be reduced. - */ - -__kernel void arg_op(TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output), - const int axis, - const int dim) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); - - int indices[4] = - { - get_global_id(0), - get_global_id(1), - get_global_id(2) % DEPTH_OUT, - get_global_id(2) / DEPTH_OUT, - }; - - DATA_TYPE value = *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); - DATA_TYPE tval = value; - int idx = 0; - for(int i = 1; i < dim; ++i) - { - indices[axis] = i; - - #if OP_CODE == 1 // ArgMax - value = max(value, *((__global DATA_TYPE *) - tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]))); - #elif OP_CODE == 2 //ArgMin - value = min(value, *((__global DATA_TYPE *) - tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]))); - #else - return; - - #endif - - if(tval!=value) - { - idx = indices[axis]; - tval = value; - } - } - - *((__global uint *)out.ptr) = idx; -} -#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_ex.cl deleted file mode 100644 index 2ed698951..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_ex.cl +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#ifdef SATURATE -#define SUB(x, y) sub_sat((x), (y)) -#else /* SATURATE */ -#define SUB(x, y) (x) - (y) -#endif /* SATURATE */ - -/** This function subtracts one tensors from another. - * - * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: - * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short - * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used. - * - * @param[in] in1_ptr Pointer to the source tensor. Supported data types: U8, S16 - * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] in2_ptr Pointer to the source tensor. Supported data types: U8, S16 - * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8, S16 - * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void arithmetic_sub_ex( - TENSOR3D_DECLARATION(in1), - TENSOR3D_DECLARATION(in2), - TENSOR3D_DECLARATION(out)) -{ - // Get pixels pointer - Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); - Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); - - // Load values - VEC_DATA_TYPE(DATA_TYPE_OUT, 16) - in_a = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16)); - VEC_DATA_TYPE(DATA_TYPE_OUT, 16) - in_b = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16)); - - // Calculate and store result - vstore16(SUB(in_a, in_b), 0, (__global DATA_TYPE_OUT *)out.ptr); -} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl deleted file mode 100644 index 5cd0a4309..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016, 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers_asymm.h" - -#ifdef SATURATE -#define ADD(x, y) add_sat((x), (y)) -#define SUB(x, y) sub_sat((x), (y)) -#else /* SATURATE */ -#define ADD(x, y) (x) + (y) -#define SUB(x, y) (x) - (y) -#endif /* SATURATE */ - -/** Performs a pixelwise addition used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8 - * - * The following computations will be performed: - * - * -# Add offset terms to inputs - -# Get scaled value of two inputs - * -# Add inputs - * -# Add offset terms to final result - * -# Multiply each entry of result by result_mult_int - * -# Shift the int32 accumulator by result_shift - * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. - * - * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: - * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar - * @attention The number of bits to shift left of input tensors must be passed at compile time using -DLEFT_SHIFT - * @attention The offset, scalar scale factor and number of bits to shift right of input tensors must be passed at compile time using -DIN1_OFFSET, -RIN1_MULT_INT, -DIN1_SHIFT, -DIN2_OFFSET, -RIN2_MULT_INT and -DIN2_SHIFT - * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT - * - * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: - * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar - * @attention The inputs and output scale information of qasymm8 need to be passed at compile time using -DSCALE_IN1, -DSCALE_IN2 and -DSCALE_OUT: - * e.g. -DSCALE_IN1=1.f -DSCALE_IN2=1.f -DSCALE_OUT=2.f - * @attention The inputs and output scale offset need to be passed at compile time using -DOFFSET_IN1, -DOFFSET_IN2 and -DOFFSET_OUT: - * e.g. -DOFFSET_IN1=0 -DOFFSET_IN2=0 -DOFFSET_OUT=0 - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used. - * - * @param[in] in1_ptr Pointer to the source tensor. Supported data types: QASYMM8 - * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] in2_ptr Pointer to the source tensor. Supported data types: QASYMM8 - * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] out_ptr Pointer to the destination tensor. Supported data types: QASYMM8 - * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void arithmetic_add_qasymm8( - TENSOR3D_DECLARATION(in1), - TENSOR3D_DECLARATION(in2), - TENSOR3D_DECLARATION(out)) -{ - // Get pixels pointer - Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); - Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); - - // Load data - VEC_DATA_TYPE(int, 16) - in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16)); - VEC_DATA_TYPE(int, 16) - in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16)); - - // Get scaled value of two inputs - VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET); - VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET); - - VEC_DATA_TYPE(int, 16) left_shift = (VEC_DATA_TYPE(int, 16))1 << (VEC_DATA_TYPE(int, 16))(LEFT_SHIFT); - VEC_DATA_TYPE(int, 16) shifted_in1_val = in1_val * left_shift; - VEC_DATA_TYPE(int, 16) shifted_in2_val = in2_val * left_shift; - - VEC_DATA_TYPE(int, 16) scaled_in1_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in1_val, IN1_MULT_INT, IN1_SHIFT, 16); - VEC_DATA_TYPE(int, 16) scaled_in2_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in2_val, IN2_MULT_INT, IN2_SHIFT, 16); - - // Add inputs and multiply with a multiplier smaller than 1 - VEC_DATA_TYPE(int, 16) sum_val = scaled_in1_val + scaled_in2_val; - VEC_DATA_TYPE(int, 16) out_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(sum_val, RESULT_MULT_INT, RESULT_SHIFT, 16); - out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET); - - VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16)); - -// TODO: Apply min-max BOUND to support fuse with relu. -/* -#if defined(MIN_BOUND) - res = max(res, (uchar16)MIN_BOUND); -#endif // defined(MIN_BOUND) -#if defined(MAX_BOUND) - res = min(res, (uchar16)MAX_BOUND); -#endif // defined(MAX_BOUND) -*/ - - // Store result - VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), - 0, (__global DATA_TYPE_OUT *)out.ptr); -} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/batch_to_space_nd.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/batch_to_space_nd.cl deleted file mode 100644 index ad6a48a02..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/batch_to_space_nd.cl +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016, 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE0) && defined(BLOCK_SIZE1) && defined(BATCH_OUT) -/** Perform batch to space rearrangement of tensor - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Output tensor batch should be given as a preprocessor argument using -DBATCH_OUT=size. e.g. -DBATCH_OUT=16 - * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE0=size. e.g. -DBLOCK_SIZE0=1 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p inpu -t_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in -bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void batch_to_space_nd( - TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output)) - { - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); - - int out_index[4]={0}; - int in_index[4]={0}; - - out_index[0] = get_global_id(0);//W - out_index[1] = get_global_id(1);//H - out_index[2] = get_global_id(2) % DEPTH_OUT;//C - out_index[3] = get_global_id(2) / DEPTH_OUT;//N - - in_index[0] = out_index[0]/BLOCK_SIZE1; - in_index[1] = out_index[1]/BLOCK_SIZE0; - in_index[2] = out_index[2]; - in_index[3] = out_index[3] + ((out_index[1] % BLOCK_SIZE0) * BLOCK_SIZE0 + out_index[0] % BLOCK_SIZE1) * BATCH_OUT; - - *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3])); - } -#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE0) && defined(BLOCK_SIZE1) && defined(BATCH_OUT) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl deleted file mode 100644 index bea61f53e..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#ifndef VEC_SIZE -#define VEC_SIZE 1 -#endif - -#if defined(OP_CODE) && defined(DATA_TYPE) -/** returns truth value of the two input tensors for BINARY LOGICAL OP. - * where BINARY LOGICAL OP can be AND, OR. - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=uchar - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using - * -DOP_CODE = number. e.g. -DOP_CODE=1 - * - * @param[in] input1_ptr Pointer to the source tensor. Supported data types: QASYMM8 - * @param[in] input1_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input1_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source tensor - * - * @param[in] input2_ptr Pointer to the source tensor.Supported data types: QASYMM8 - * @param[in] input2_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input2_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input2_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input2_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input2_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source tensor - * - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: QASYMM8 - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - */ -__kernel void binary_logical_op( - TENSOR3D_DECLARATION(input1), - TENSOR3D_DECLARATION(input2), - TENSOR3D_DECLARATION(output)) -{ - Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1); - Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - #if OP_CODE == 1 // LOGICAL AND - VSTORE(VEC_SIZE) - (CONVERT(VLOAD(VEC_SIZE) - (0, (__global DATA_TYPE *)input1.ptr) && VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr), - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr); - - #elif OP_CODE == 2 // LOGICAL OR - VSTORE(VEC_SIZE) - (CONVERT(VLOAD(VEC_SIZE) - (0, (__global DATA_TYPE *)input1.ptr) || VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr), - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr); - - #else // OP NOT SUPPORTED - return - - #endif -} -#endif //if defined(OP_CODE) && defined(DATA_TYPE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl deleted file mode 100644 index 3d4675e5d..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#ifndef SCALE -#define SCALE 1.0f -#endif -#ifndef OFFSET -#define OFFSET 0 -#endif -#ifndef VEC_SIZE -#define VEC_SIZE 1 -#endif - -#if defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) -/** Perform a cast operation on an input tensor. - * - * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * - * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void cast( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) -{ - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr), - VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), - 0, (__global DATA_TYPE_OUT *)output.ptr); -} - -/** Perform a cast operation on an QASYMM8 input tensor. - * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int - * @attention Offset and Scale of input should be given as a preprocessor argument using -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5 - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * - * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void cast_qasymm_in( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) -{ - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_data = - VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr); - VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET); - VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE); - - VEC_DATA_TYPE(int, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(int, VEC_SIZE)) - offset; - VEC_DATA_TYPE(float, VEC_SIZE) out_data = CONVERT(tmp, VEC_DATA_TYPE(float, VEC_SIZE)) * scale; - - VSTORE(VEC_SIZE)(CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), - 0, (__global DATA_TYPE_OUT *)output.ptr); -} - - -/** Perform a cast operation on an QASYMM8 output tensor. - * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int - * @attention Offset and Scale of output should be given as a preprocessor argument using -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5 - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * - * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: U8 - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void cast_qasymm_out( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) -{ - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_data = - VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr); - VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET); - VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE); - - VEC_DATA_TYPE(float, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(float, VEC_SIZE)) / scale; - VEC_DATA_TYPE(float, VEC_SIZE) out_data = tmp + CONVERT(offset, VEC_DATA_TYPE(float, VEC_SIZE)); - - VSTORE(VEC_SIZE)(CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), - 0, (__global DATA_TYPE_OUT *)output.ptr); -} -#endif // defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op.cl deleted file mode 100644 index 765072556..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op.cl +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#ifndef VEC_SIZE -#define VEC_SIZE 1 -#endif - -#if defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) && defined(OP_CODE) -/** Returns truth value of comparison operators. - * Comparison operators may be equal, not_equal etc. - * - * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN, -DDATA_TYPE_OUT, - * e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT = uchar - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using - * -DOP_CODE = number. e.g. -DOP_CODE=1 - * - * @param[in] input1_ptr Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32 - * @param[in] input1_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input1_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source tensor - * - * @param[in] input2_ptr Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32 - * @param[in] input2_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input2_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input2_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input2_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input2_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source tensor - * - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: QASYMM8 - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void comparison_op( - TENSOR3D_DECLARATION(input1), - TENSOR3D_DECLARATION(input2), - TENSOR3D_DECLARATION(output)) -{ - Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1); - Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - #if OP_CODE == 1 //EQUAL - VSTORE(VEC_SIZE) - (CONVERT(VLOAD(VEC_SIZE) - (0, (__global DATA_TYPE_IN *)input1.ptr) == VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input2.ptr), - VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),0, (__global DATA_TYPE_OUT *)output.ptr); - - #elif OP_CODE == 2 //NOT_EQUAL - VSTORE(VEC_SIZE) - (CONVERT(VLOAD(VEC_SIZE) - (0, (__global DATA_TYPE_IN *)input1.ptr) != VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input2.ptr), - VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)output.ptr); - - #else // OP NOT SUPPORTED - return; - - #endif -} -#endif // defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) && defined(OP_CODE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl deleted file mode 100644 index 1eb305f7b..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" -#define SUB(x, y) (x) - (y) - -#ifndef VEC_SIZE -#define VEC_SIZE 1 -#endif - -#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(DATA_TYPE_OUT) - -#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE) -#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) -#define VEC_OUT VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE) - -/** Returns the truth value of comparison . - * @attention Offset and Scale of both input should be given as a preprocessor argument using -DOFFSET_IN1=int, -DOFFSET_IN2=int, -DSCALE_IN1=float and -DSCALE_IN2=float. e.g. -DOFFSET_IN1=1, -DOFFSET_IN2=0, -DSCALE_IN1=0.5, -DSCALE_IN2=0.5 - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using - * -DOP_CODE = number. e.g. -DOP_CODE=1 - * - * @param[in] input1_ptr Pointer to the source tensor. Supported data types: QASYMM8 - * @param[in] input1_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input1_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source tensor - * - * @param[in] input2_ptr Pointer to the source tensor. Supported data types: QASYMM8 - * @param[in] input2_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input2_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input2_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input2_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input2_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source tensor - * - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: QASYMM8 - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void comparison_op_qasymm8( - TENSOR3D_DECLARATION(in1), - TENSOR3D_DECLARATION(in2), - TENSOR3D_DECLARATION(out)) -{ - // Get pixels pointer - Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); - Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); - - VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)in1.ptr), VEC_INT); - VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)in2.ptr), VEC_INT); - - in_a = SUB(in_a, (VEC_INT)((int)OFFSET_IN1)); - in_b = SUB(in_b, (VEC_INT)((int)OFFSET_IN2)); - - const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1); - const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2); - - #if OPCODE == 1 //EQUAL QUANTIZED - VSTORE(VEC_SIZE)(CONVERT(in1f32 == in2f32, VEC_OUT), 0, (__global DATA_TYPE_OUT *)out.ptr); - - #elif OPCODE == 2 //NOT EQUAL QUANTIZED - VSTORE(VEC_SIZE)(CONVERT(in1f32 != in2f32, VEC_OUT), 0, (__global DATA_TYPE_OUT *)out.ptr); - - #else // OP NOT SUPPORTED - return; - - #endif -} -#endif // defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(DATA_TYPE_OUT) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl deleted file mode 100644 index fef2243e7..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016, 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) -/** Perform space to depth rearrangement of tensor - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16 - * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. -DBLOCK_SIZE=1 - * - * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p inpu -t_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in -bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void depth_to_space( - TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output)) - { - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); - - int out_index[4]={0}; - int in_index[4]={0}; - - out_index[0] = get_global_id(0);//W - out_index[1] = get_global_id(1);//H - out_index[2] = get_global_id(2) % DEPTH_OUT;//C - out_index[3] = get_global_id(2) / DEPTH_OUT;//B - - in_index[0] = out_index[0]/BLOCK_SIZE; - in_index[1] = out_index[1]/BLOCK_SIZE; - in_index[2] = out_index[2] + ((out_index[1] % BLOCK_SIZE) * BLOCK_SIZE + out_index[0] % BLOCK_SIZE) * DEPTH_OUT; - in_index[3] = out_index[3]; - - *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2],in_index[3])); - } -#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl deleted file mode 100644 index 348458fe9..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#ifndef VEC_SIZE -#define VEC_SIZE 1 -#endif - -#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS) -/** Perform embedding_lookup of input tensor - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16 - * @attention Number of input dimensions are passed as a preprocessor argument using -DNUM_DIMS=size, e.g. -DNUM_DIMS=4 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] lookups_ptr Pointer to the lookups vector. Supported data types: S32 - * @param[in] lookups_stride_x Stride of the lookups vector in X dimension (in bytes) - * @param[in] lookups_step_x lookups_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector - */ - -__kernel void embedding_lookup(TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output), - VECTOR_DECLARATION(lookups)) -{ - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT); - - Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups); - - //lookup ids for based on the tensor dimensions - int lup_id[4] = {0}; - - lup_id[0] = (NUM_DIMS == 1)?*((__global int *)vector_offset(&lups,get_global_id(0))) - :get_global_id(0); - lup_id[1] = (NUM_DIMS == 2)?*((__global int *)vector_offset(&lups,get_global_id(1))) - :get_global_id(1); - lup_id[2] = (NUM_DIMS == 3)?*((__global int *)vector_offset(&lups,get_global_id(2))) - :get_global_id(2)%DEPTH_OUT; - lup_id[3] = (NUM_DIMS == 4)?*((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) - :get_global_id(2) / DEPTH_OUT; - - in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + lup_id[1] * input_step_y - + lup_id[2] * input_step_z + lup_id[3] * input_step_w; - - VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), - 0, (__global DATA_TYPE *)out.ptr); -} -#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl deleted file mode 100644 index 69d94f30a..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#ifndef VEC_SIZE -#define VEC_SIZE 1 -#endif - -#if defined(DATA_TYPE) -/** Perform an exponential operation on an input tensor. - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @note Can only take floating point data types. - * - * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void exp_layer( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) -{ - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VSTORE(VEC_SIZE) - (exp(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr)), 0, (__global DATA_TYPE *)output.ptr); -} -#endif // defined(DATA_TYPE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl deleted file mode 100644 index 6b767d6c9..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -/** Perform gather - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * - * @param[in] input1_ptr Pointer to the first source tensor. Supported data types: U8/S32/F32 - * @param[in] input1_stride_x Stride of the first source tensor in X dimension (in bytes) - * @param[in] input1_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input1_stride_y Stride of the first source tensor in Y dimension (in bytes) - * @param[in] input1_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input1_stride_z Stride of the first source tensor in Z dimension (in bytes) - * @param[in] input1_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[in] input2_ptr Pointer to the first source tensor. Supported data types: U32 - * @param[in] input2_stride_x Stride of the first source tensor in X dimension (in bytes) - * @param[in] input2_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void gather(IMAGE_DECLARATION(input1), - VECTOR_DECLARATION(input2), - IMAGE_DECLARATION(output)) -{ - Image in1 = CONVERT_TO_IMAGE_STRUCT_NO_STEP(input1); - Vector in2 = CONVERT_TO_VECTOR_STRUCT(input2); - Image out = CONVERT_TO_IMAGE_STRUCT_NO_STEP(output); - - VEC_DATA_TYPE(DATA_TYPE_IN2, 2) - in2_data = CONVERT(vload2(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_IN2, 2)); - - //TODO: performance tuning for memcopy - int index = in2_data.s0; - int stride=input1_stride_y/input1_stride_x; - - for(int i=0; i<stride; i++){ - *((__global DATA_TYPE_OUT *)offset(&out, i,get_global_id(0)))=*((__global DATA_TYPE_IN1 *)offset(&in1, i,index)); - } -} - -__kernel void gather_1d_out(IMAGE_DECLARATION(input1), - VECTOR_DECLARATION(input2), - VECTOR_DECLARATION(output)) -{ - Image in1 = CONVERT_TO_IMAGE_STRUCT_NO_STEP(input1); - Vector in2 = CONVERT_TO_VECTOR_STRUCT(input2); - Vector out = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output); - - VEC_DATA_TYPE(DATA_TYPE_IN2, 2) - in2_data = CONVERT(vload2(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_IN2, 2)); - - //TODO: performance tuning for memcopy - int index = in2_data.s0; - int stride=input1_stride_y/input1_stride_x; - - for(int i=0; i<stride; i++){ - *((__global DATA_TYPE_OUT *)vector_offset(&out, i+get_global_id(0)))=*((__global DATA_TYPE_IN1 *)offset(&in1, i, index)); - } -} - -__kernel void gather_1d(VECTOR_DECLARATION(input1), - VECTOR_DECLARATION(input2), - VECTOR_DECLARATION(output)) -{ - Vector in1 = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input1); - Vector in2 = CONVERT_TO_VECTOR_STRUCT(input2); - Vector out = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output); - - VEC_DATA_TYPE(DATA_TYPE_IN2, 2) - in2_data = CONVERT(vload2(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_IN2, 2)); - - //TODO: performance tuning for memcopy - int index = in2_data.s0; - *((__global DATA_TYPE_OUT *)vector_offset(&out, get_global_id(0)))=*((__global DATA_TYPE_IN1 *)vector_offset(&in1, index)); -} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl deleted file mode 100644 index ed7409852..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#ifndef VEC_SIZE -#define VEC_SIZE 1 -#endif - -#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS) -/** Perform hashtable_lookup of input tensor - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16 - * @attention Number of input dimensions are passed as a preprocessor argument using -DNUM_DIMS=size, e.g. -DNUM_DIMS=4 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] lookups_ptr Pointer to the lookups vector. Supported data types: S32 - * @param[in] lookups_stride_x Stride of the lookups vector in X dimension (in bytes) - * @param[in] lookups_step_x lookups_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector - */ -__kernel void hashtable_lookup(TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output), - VECTOR_DECLARATION(lookups)) -{ - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT); - - Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups); - - int lup_id[4] = {0}; - - lup_id[0] = (NUM_DIMS == 1)?*((__global int *)vector_offset(&lups,get_global_id(0))) - :get_global_id(0); - lup_id[1] = (NUM_DIMS == 2)?*((__global int *)vector_offset(&lups,get_global_id(1))) - :get_global_id(1); - lup_id[2] = (NUM_DIMS == 3)?*((__global int *)vector_offset(&lups,get_global_id(2))) - :get_global_id(2)%DEPTH_OUT; - lup_id[3] = (NUM_DIMS == 4)?*((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) - :get_global_id(2) / DEPTH_OUT; - - if (lup_id[NUM_DIMS-1] < 0) - { - VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, 0, (__global DATA_TYPE *)out.ptr); - return; - } - - in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + lup_id[1] * input_step_y - + lup_id[2] * input_step_z + lup_id[3] * input_step_w; - - VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), - 0, (__global DATA_TYPE *)out.ptr); -} -#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h deleted file mode 100644 index 0e123ae0a..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h +++ /dev/null @@ -1,352 +0,0 @@ -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_HELPER_H -#define ARM_COMPUTE_HELPER_H - -#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) -#pragma OPENCL EXTENSION cl_khr_fp16 : enable -#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) - -#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) -#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable -#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) - -#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && \ - defined(cl_arm_integer_dot_product_accumulate_int8) -#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable -#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && - // defined(cl_arm_integer_dot_product_accumulate_int8) - -#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) -#pragma OPENCL EXTENSION cl_arm_printf : enable -#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) - -#define EXPAND(x) x - -#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) - -#define VLOAD_STR(size) vload##size -#define VLOAD(size) VLOAD_STR(size) - -#define VSTORE_STR(size) vstore##size -#define VSTORE(size) VSTORE_STR(size) - -#define VEC_DATA_TYPE_STR(type, size) type##size -#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) - -#define CL_VEC_DATA_TYPE_STR(type, size) type##size -#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size) - -#define CONVERT_STR(x, type) (convert_##type((x))) -#define CONVERT(x, type) CONVERT_STR(x, type) - -#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x))) -#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) - -#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x))) -#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round) - -#define VECTOR_DECLARATION(name) \ - __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, \ - uint name##_offset_first_element_in_bytes - -#define IMAGE_DECLARATION(name) \ - __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \ - uint name##_step_y, uint name##_offset_first_element_in_bytes - -#define TENSOR3D_DECLARATION(name) \ - __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \ - uint name##_step_y, uint name##_stride_z, uint name##_step_z, \ - uint name##_offset_first_element_in_bytes - -#define TENSOR4D_DECLARATION(name) \ - __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \ - uint name##_step_y, uint name##_stride_z, uint name##_step_z, uint name##_stride_w, \ - uint name##_step_w, uint name##_offset_first_element_in_bytes - -#define CONVERT_TO_VECTOR_STRUCT(name) \ - update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ - name##_step_x) - -#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \ - update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0) - -#define CONVERT_TO_IMAGE_STRUCT(name) \ - update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ - name##_step_x, name##_stride_y, name##_step_y) - -#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ - update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \ - name##_stride_y, 0) - -#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ - update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \ - name##_stride_x, name##_step_x, name##_stride_y, \ - name##_step_y, name##_stride_z, name##_step_z) - -#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ - update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \ - name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, \ - name##_step_z) - -#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ - update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \ - name##_stride_x, name##_step_x, name##_stride_y, \ - name##_step_y, name##_stride_z, name##_step_z) - -#define CONVERT_TO_TENSOR3D_STRUCT(name) \ - update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ - name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \ - name##_step_z) - -#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ - update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ - 0, name##_stride_y, 0, name##_stride_z, 0) - -#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ - update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ - name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \ - name##_step_z, name##_stride_w, name##_step_w, mod_size) - -#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ - update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ - 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, \ - mod_size) - -/** Structure to hold Vector information */ -typedef struct Vector -{ - __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ - int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ - int stride_x; /**< Stride of the image in X dimension (in bytes) */ -} Vector; - -/** Structure to hold Image information */ -typedef struct Image -{ - __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ - int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ - int stride_x; /**< Stride of the image in X dimension (in bytes) */ - int stride_y; /**< Stride of the image in Y dimension (in bytes) */ -} Image; - -/** Structure to hold 3D tensor information */ -typedef struct Tensor3D -{ - __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ - int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ - int stride_x; /**< Stride of the image in X dimension (in bytes) */ - int stride_y; /**< Stride of the image in Y dimension (in bytes) */ - int stride_z; /**< Stride of the image in Z dimension (in bytes) */ -} Tensor3D; - -/** Structure to hold 4D tensor information */ -typedef struct Tensor4D -{ - __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ - int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ - int stride_x; /**< Stride of the image in X dimension (in bytes) */ - int stride_y; /**< Stride of the image in Y dimension (in bytes) */ - int stride_z; /**< Stride of the image in Z dimension (in bytes) */ - int stride_w; /**< Stride of the image in W dimension (in bytes) */ -} Tensor4D; - -/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's - * data. - * - * @param[in] ptr Pointer to the starting postion of the buffer - * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector - * @param[in] stride_x Stride of the vector in X dimension (in bytes) - * @param[in] step_x stride_x * number of elements along X processed per - * workitem(in bytes) - * - * @return An image object - */ -inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, - uint stride_x, uint step_x) -{ - Vector vector = { - .ptr = ptr, - .offset_first_element_in_bytes = offset_first_element_in_bytes, - .stride_x = stride_x, - }; - vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x; - return vector; -} - -/** Wrap image information into an Image structure, and make the pointer point at this workitem's - * data. - * - * @param[in] ptr Pointer to the starting postion of the buffer - * @param[in] offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] stride_x Stride of the image in X dimension (in bytes) - * @param[in] step_x stride_x * number of elements along X processed per - * workitem(in bytes) - * @param[in] stride_y Stride of the image in Y dimension (in bytes) - * @param[in] step_y stride_y * number of elements along Y processed per - * workitem(in bytes) - * - * @return An image object - */ -inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, - uint stride_x, uint step_x, uint stride_y, uint step_y) -{ - Image img = {.ptr = ptr, - .offset_first_element_in_bytes = offset_first_element_in_bytes, - .stride_x = stride_x, - .stride_y = stride_y}; - img.ptr += - img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; - return img; -} - -/** Wrap 3D tensor information into an image structure, and make the pointer point at this - * workitem's data. - * - * @param[in] ptr Pointer to the starting postion of the buffer - * @param[in] offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] stride_x Stride of the image in X dimension (in bytes) - * @param[in] step_x stride_x * number of elements along X processed per - * workitem(in bytes) - * @param[in] stride_y Stride of the image in Y dimension (in bytes) - * @param[in] step_y stride_y * number of elements along Y processed per - * workitem(in bytes) - * @param[in] stride_z Stride of the image in Z dimension (in bytes) - * @param[in] step_z stride_z * number of elements along Z processed per - * workitem(in bytes) - * - * @return A 3D tensor object - */ -inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, - uint offset_first_element_in_bytes, - uint stride_x, uint step_x, uint stride_y, - uint step_y, uint stride_z, uint step_z) -{ - Image img = {.ptr = ptr, - .offset_first_element_in_bytes = offset_first_element_in_bytes, - .stride_x = stride_x, - .stride_y = stride_y}; - img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + - get_global_id(1) * step_y + get_global_id(2) * step_z; - return img; -} - -/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this - * workitem's data. - * - * @param[in] ptr Pointer to the starting postion of the buffer - * @param[in] offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] stride_x Stride of the image in X dimension (in bytes) - * @param[in] step_x stride_x * number of elements along X processed per - * workitem(in bytes) - * @param[in] stride_y Stride of the image in Y dimension (in bytes) - * @param[in] step_y stride_y * number of elements along Y processed per - * workitem(in bytes) - * @param[in] stride_z Stride of the image in Z dimension (in bytes) - * @param[in] step_z stride_z * number of elements along Z processed per - * workitem(in bytes) - * - * @return A 3D tensor object - */ -inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, - uint offset_first_element_in_bytes, uint stride_x, - uint step_x, uint stride_y, uint step_y, uint stride_z, - uint step_z) -{ - Tensor3D tensor = {.ptr = ptr, - .offset_first_element_in_bytes = offset_first_element_in_bytes, - .stride_x = stride_x, - .stride_y = stride_y, - .stride_z = stride_z}; - tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + - get_global_id(1) * step_y + get_global_id(2) * step_z; - return tensor; -} - -inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, - uint offset_first_element_in_bytes, uint stride_x, - uint step_x, uint stride_y, uint step_y, uint stride_z, - uint step_z, uint stride_w, uint step_w, uint mod_size) -{ - Tensor4D tensor = {.ptr = ptr, - .offset_first_element_in_bytes = offset_first_element_in_bytes, - .stride_x = stride_x, - .stride_y = stride_y, - .stride_z = stride_z, - .stride_w = stride_w}; - - tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + - get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + - (get_global_id(2) / mod_size) * step_w; - return tensor; -} - -/** Get the pointer position of a Vector - * - * @param[in] vec Pointer to the starting position of the buffer - * @param[in] x Relative X position - */ -inline __global const uchar *vector_offset(const Vector *vec, int x) -{ - return vec->ptr + x * vec->stride_x; -} - -/** Get the pointer position of a Image - * - * @param[in] img Pointer to the starting position of the buffer - * @param[in] x Relative X position - * @param[in] y Relative Y position - */ -inline __global uchar *offset(const Image *img, int x, int y) -{ - return img->ptr + x * img->stride_x + y * img->stride_y; -} - -/** Get the pointer position of a Tensor3D - * - * @param[in] tensor Pointer to the starting position of the buffer - * @param[in] x Relative X position - * @param[in] y Relative Y position - * @param[in] z Relative Z position - */ -inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z) -{ - return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z; -} - -/** Get the pointer position of a Tensor4D - * - * @param[in] tensor Pointer to the starting position of the buffer - * @param[in] x Relative X position - * @param[in] y Relative Y position - * @param[in] z Relative Z position - * @param[in] w Relative W position - */ -inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w) -{ - return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + - w * tensor->stride_w; -} - -#endif // _HELPER_H diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h b/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h deleted file mode 100644 index c39138caa..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h +++ /dev/null @@ -1,406 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_HELPERS_ASYMM_H -#define ARM_COMPUTE_HELPERS_ASYMM_H - -#include "helpers.h" - -/** Correctly-rounded-to-nearest division by a power-of-two. - * - * @param[in] size Size of vector. - * - * @return Correctly-rounded-to-nearest division by a power-of-two. - */ -#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, int exponent) \ - { \ - VEC_DATA_TYPE(int, size) \ - mask = (1 << exponent) - 1; \ - const VEC_DATA_TYPE(int, size) zero = 0; \ - const VEC_DATA_TYPE(int, size) one = 1; \ - VEC_DATA_TYPE(int, size) \ - threshold = (mask >> 1) + select(zero, one, x < 0); \ - return (x >> exponent) + select(zero, one, (x & mask) > threshold); \ - } - -/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1), - * rounding to the nearest value, and saturating -1 * -1 to the maximum value. - * - * @param[in] size Size of vector. - * - * @return Product of two fixed-point numbers. - */ -#define ASYMM_MULT_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ - { \ - VEC_DATA_TYPE(int, size) \ - overflow = a == b && a == INT_MIN; \ - VEC_DATA_TYPE(long, size) \ - a_64 = convert_long##size(a); \ - VEC_DATA_TYPE(long, size) \ - b_64 = convert_long##size(b); \ - VEC_DATA_TYPE(long, size) \ - ab_64 = a_64 * b_64; \ - /* COMPMID-907 */ \ - VEC_DATA_TYPE(int, size) \ - ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31)); \ - return select(ab_x2_high32, INT_MAX, overflow); \ - } - -/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0). - * - * @param[in] size Size of vector. - * - * @return Result in fixed-point format Q0. - */ -#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) \ - a) \ - { \ - const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \ - const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \ - const int k_fractional_bits = 31; \ - VEC_DATA_TYPE(int, size) \ - x = a + (1 << (k_fractional_bits - 3)); \ - VEC_DATA_TYPE(int, size) \ - x2 = ASYMM_MULT(x, x, size); \ - VEC_DATA_TYPE(int, size) \ - x3 = ASYMM_MULT(x2, x, size); \ - VEC_DATA_TYPE(int, size) \ - x4 = ASYMM_MULT(x2, x2, size); \ - VEC_DATA_TYPE(int, size) \ - x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \ - VEC_DATA_TYPE(int, size) \ - x4_over_24_plus_x3_over_6_plus_x2 = \ - ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \ - VEC_DATA_TYPE(int, size) \ - x4_over_24_plus_x3_over_6_plus_x2_over_2 = \ - ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \ - return constant_term + \ - ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \ - } - -/** Each bit of the result is set to the corresponding bit of either then_val or - * else_val depending on whether the corresponding bit of if_mask is set. - * Equivalent to the VBSL instruction in ARM NEON. - * - * @param[in] size Size of vector. - * - * @returns Result contaning bits from @p then_val or from @p else_val depending on corresponding - * bit in @p if_mask is set or not. - */ -#define ASYMM_SELECT_USING_MASK_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, \ - VEC_DATA_TYPE(int, size) then_val, \ - VEC_DATA_TYPE(int, size) else_val) \ - { \ - return (if_mask & then_val) ^ (~if_mask & else_val); \ - } - -/** For each element of input vector, the corresponding bits of the result item are set - * if the input item is zero. - * - * @param[in] size Size of vector. - * - * @returns Output vector with bits set when corresponding bit in @p a is zero. - */ -#define ASYMM_MASK_IF_ZERO_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \ - { \ - const VEC_DATA_TYPE(int, size) all_zeros = 0; \ - const VEC_DATA_TYPE(int, size) all_ones = ~0; \ - return select(all_zeros, all_ones, a == 0); \ - } - -/** For each element of input vector, the corresponding bits of the result item are set - * if the input item is non-zero. - * - * @param[in] size Size of vector. - * - * @returns Output vector with bits set when corresponding bit in @p a is non zero. - */ -#define ASYMM_MASK_IF_NON_ZERO_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \ - { \ - const VEC_DATA_TYPE(int, size) all_zeros = 0; \ - const VEC_DATA_TYPE(int, size) all_ones = ~0; \ - return select(all_zeros, all_ones, a != 0); \ - } - -#define EXP_BARREL_SHIFTER_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size( \ - VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, \ - int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \ - { \ - if (k_integer_bits > exponent) \ - { \ - const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0; \ - return ASYMM_SELECT_USING_MASK( \ - ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \ - ASYMM_MULT(result, fp_multiplier, size), result, size); \ - } \ - \ - return result; \ - } - -/** Calculates \f$ exp(x) \f$ for x < 0. - * - * @param[in] size Size of vector. - * - * @return Result in fixed-point format Q0. - */ -#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \ - { \ - const int k_fractional_bits = 31 - k_integer_bits; \ - VEC_DATA_TYPE(int, size) \ - k_one_quarter = 1 << (k_fractional_bits - 2); \ - VEC_DATA_TYPE(int, size) \ - mask = k_one_quarter - 1; \ - VEC_DATA_TYPE(int, size) \ - a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter; \ - VEC_DATA_TYPE(int, size) \ - a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \ - VEC_DATA_TYPE(int, size) \ - result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL( \ - a_mod_quarter_minus_one_quarter_scaled, size); \ - VEC_DATA_TYPE(int, size) \ - remainder = a_mod_quarter_minus_one_quarter - a; \ - \ - result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, \ - remainder, size); \ - result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, \ - remainder, size); \ - result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, \ - remainder, size); \ - result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, \ - remainder, size); \ - result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, \ - remainder, size); \ - result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, \ - size); \ - result = \ - EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \ - \ - if (k_integer_bits > 5) \ - { \ - const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5)); \ - result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size); \ - } \ - \ - const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \ - return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size); \ - } - -/** Calculates the product of a integer value by a power of two, with either a positive exponent - * (equivalent to an arithmetic left shift, saturating) or a negative exponent - * (equivalent to an arithmetic right shift, rounding to nearest). - * - * @param[in] size Size of vector. - * - * @return Arithmetic left or right shift. - */ -#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \ - { \ - if (exponent < 0) \ - { \ - return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \ - } \ - \ - const VEC_DATA_TYPE(int, size) min = INT_MIN; \ - const VEC_DATA_TYPE(int, size) max = INT_MAX; \ - int threshold = ((1 << (31 - exponent)) - 1); \ - VEC_DATA_TYPE(int, size) \ - positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \ - VEC_DATA_TYPE(int, size) \ - negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \ - VEC_DATA_TYPE(int, size) \ - result = x << exponent; \ - result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \ - result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \ - return result; \ - } - -/** Calculates (a+b)/2, rounded to the nearest integer. - * Equivalent to VRHADD in the ARM NEON instruction set. - * - * @param[in] size Size of vector. - * - * @return (a+b)/2, rounded to the nearest integer. - */ -#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ - { \ - VEC_DATA_TYPE(long, size) \ - a64 = convert_long##size(a); \ - VEC_DATA_TYPE(long, size) \ - b64 = convert_long##size(b); \ - VEC_DATA_TYPE(long, size) \ - sum = a64 + b64; \ - const VEC_DATA_TYPE(long, size) one = 1; \ - const VEC_DATA_TYPE(long, size) minus_one = -1; \ - VEC_DATA_TYPE(long, size) \ - sign = select(minus_one, one, sum >= 0); \ - return convert_int##size((sum + sign) / 2); \ - } - -/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1). - * - * @param[in] size Size of vector. - * - * @return Result in fixed-point format Q0. - */ -#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \ - { \ - const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \ - const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2); \ - VEC_DATA_TYPE(int, size) \ - half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size); \ - const VEC_DATA_TYPE(int, size) Q2_48_over_17 = 1515870810; \ - const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540; \ - VEC_DATA_TYPE(int, size) \ - x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size); \ - for (int i = 0; i < 3; i++) \ - { \ - VEC_DATA_TYPE(int, size) \ - half_denominator_times_x = ASYMM_MULT(half_denominator, x, size); \ - VEC_DATA_TYPE(int, size) \ - one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x; \ - VEC_DATA_TYPE(int, size) \ - tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size); \ - x = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size); \ - } \ - return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size); \ - } - -/** Considering the integer value as fixed-point, change the number of integer bits and update value - * accordingly. - * - * @param[in] size Size of vector. - * - * @return Rescaled value. - */ -#define ASYMM_RESCALE_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, \ - int src_integer_bits, int dst_integer_bits) \ - { \ - int exponent = src_integer_bits - dst_integer_bits; \ - return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \ - } - -#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) \ - asymm_rounding_divide_by_POW2_##size(x, exponent) -#define ASYMM_MULT(a, b, size) asymm_mult##size(a, b) -#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \ - ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size) -#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) \ - asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a) -#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) \ - asymm_select_using_mask##size(if_mask, then_val, else_val) -#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a) -#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a) -#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, \ - remainder, size) \ - exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, \ - remainder) -#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) \ - asymm_exp_on_negative_values##size(a, k_integer_bits) -#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) \ - asymm_one_over_one_plus_x_for_x_in_0_1##size(a) -#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) \ - asymm_saturating_rounding_mult_by_pow2##size(x, exponent) -#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b) -#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \ - asymm_rescale##size(value, src_integer_bits, dst_integer_bits) - -ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2) -ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4) -ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8) -ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16) - -ASYMM_MULT_IMPL(2) -ASYMM_MULT_IMPL(4) -ASYMM_MULT_IMPL(8) -ASYMM_MULT_IMPL(16) - -ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2) -ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4) -ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8) -ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16) - -ASYMM_SELECT_USING_MASK_IMPL(2) -ASYMM_SELECT_USING_MASK_IMPL(4) -ASYMM_SELECT_USING_MASK_IMPL(8) -ASYMM_SELECT_USING_MASK_IMPL(16) - -ASYMM_MASK_IF_ZERO_IMPL(2) -ASYMM_MASK_IF_ZERO_IMPL(4) -ASYMM_MASK_IF_ZERO_IMPL(8) -ASYMM_MASK_IF_ZERO_IMPL(16) - -ASYMM_MASK_IF_NON_ZERO_IMPL(2) -ASYMM_MASK_IF_NON_ZERO_IMPL(4) -ASYMM_MASK_IF_NON_ZERO_IMPL(8) -ASYMM_MASK_IF_NON_ZERO_IMPL(16) - -EXP_BARREL_SHIFTER_IMPL(2) -EXP_BARREL_SHIFTER_IMPL(4) -EXP_BARREL_SHIFTER_IMPL(8) -EXP_BARREL_SHIFTER_IMPL(16) - -ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2) -ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4) -ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8) -ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16) - -ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2) -ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4) -ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8) -ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16) - -ASYMM_ROUNDING_HALF_SUM_IMPL(2) -ASYMM_ROUNDING_HALF_SUM_IMPL(4) -ASYMM_ROUNDING_HALF_SUM_IMPL(8) -ASYMM_ROUNDING_HALF_SUM_IMPL(16) - -ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2) -ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4) -ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8) -ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16) - -ASYMM_RESCALE_IMPL(2) -ASYMM_RESCALE_IMPL(4) -ASYMM_RESCALE_IMPL(8) -ASYMM_RESCALE_IMPL(16) - -#endif // ARM_COMPUTE_HELPERS_ASYMM_H
\ No newline at end of file diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl deleted file mode 100644 index e3aa463db..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#ifndef VEC_SIZE -#define VEC_SIZE 1 -#endif - -#if defined(DATA_TYPE) -/** Performs a negation of input tensor. - * - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * - * @param[in] in_ptr Pointer to the source image. Supported data types: S16/S32/F16/F32. - * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image - */ -__kernel void neg_tensor( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) -{ - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VSTORE(VEC_SIZE) - (-VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), 0, (__global DATA_TYPE *)output.ptr); -} -#endif // defined(DATA_TYPE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl deleted file mode 100644 index ecf4696e9..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016, 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#if defined(IW) && defined(IH) && defined(ID) && defined(IB) && defined(DEPTH_OUT) && defined(ZERO_VALUE) -/** Perform space to depth rearrangement of tensor - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 - * @attention Input dimensions should be passed as a preprocessor argument using -DIW(width), -DIH(height), -DID(depth) and -DIB(batch). e.g. -DIW = 4 - * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p inpu -t_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in -bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * - * @param[in] pad_values Padding values for each of the dimensions. Only pad values for Up(for - * batch), Top(for height), Left(for width) and Front(for depth) are - * required. Supported data type: S32 - */ - -__kernel void pad( - TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output), - const int4 pad_values) - { - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); - - int index[4]={0}; - - index[0] = get_global_id(0);//W - index[1] = get_global_id(1);//H - index[2] = get_global_id(2) % DEPTH_OUT;//C - index[3] = get_global_id(2) / DEPTH_OUT;//N - - if (index[0] < pad_values.x || index[0] >= (IW + pad_values.x) || - index[1] < pad_values.y || index[1] >= (IH + pad_values.y) || - index[2] < pad_values.z || index[2] >= (ID + pad_values.z) || - index[3] < pad_values.w || index[3] >= (IB + pad_values.w)) - { - *((__global DATA_TYPE *)out.ptr) = (DATA_TYPE)ZERO_VALUE; - } - else - { - *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *) - tensor4D_offset(&in, index[0] - pad_values.x, - index[1] - pad_values.y, - index[2] - pad_values.z, - index[3] - pad_values.w)); - } - } - -#endif //if defined(IW) && defined(IH) && defined(ID) && defined(IB) && defined(DEPTH_OUT) && defined(ZERO_VALUE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl deleted file mode 100644 index 7cc8b0354..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4) -/** Perform a Generic permute operation on an input tensor of Shape DCHW. - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16 - * @attention Permutation vector is passed as a preprocessor arguement using -DP1, -DP2, -DP3 and -DP4=int, e.g. -DP1=2 - * - * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U1 -6/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in b -ytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in b -ytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in b -ytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p inpu -t_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in -bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void permute_generic( - TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output)) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); - - int out_index[4]; - int in_index[4]; - in_index[0] = get_global_id(0);//W - in_index[1] = get_global_id(1);//H - in_index[2] = get_global_id(2) % DEPTH_IN;//C - in_index[3] = get_global_id(2) / DEPTH_IN;//B - out_index[0] = in_index[P1]; - out_index[1] = in_index[P2]; - out_index[2] = in_index[P3]; - out_index[3] = in_index[P4]; - - *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0],out_index[1],out_index[2],out_index[3])) = *((__global DATA_TYPE *)in.ptr); -} -#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl deleted file mode 100644 index aa05121b1..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016, 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#ifdef SATURATE -#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x)) -#else /* SATURATE */ -#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x)) -#endif /* SATURATE */ -#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round) - -/** Performs a pixelwise division with float scale of either integer or float inputs. - * - * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: - * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short - * @attention The data type of the intermediate result of the division should passed as well using -DDATA_TYPE_RES. - * e.g. If one of inputs is S16 -DDATA_TYPE_RES=int should be passed else -DDATA_TYPE_RES=short. - * @attention -DDATA_TYPE_FLOAT must be passed if floating point inputs are provided. - * - * @param[in] in1_ptr Pointer to the source image. Supported data types: U8, S16, F16, F32 - * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in1_stride_z Stride of the source image in Y dimension (in bytes) - * @param[in] in1_step_z in1_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16, F16, F32 - * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in2_stride_z Stride of the source image in Y dimension (in bytes) - * @param[in] in2_step_z in2_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16, F16, F32 - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_stride_z Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_z out_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] scale Float scaling factor. Supported data types: F32 - */ -__kernel void pixelwise_div_float( - TENSOR3D_DECLARATION(in1), - TENSOR3D_DECLARATION(in2), - TENSOR3D_DECLARATION(out), - const float scale) -{ - // Get pixels pointer - Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); - Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); - - // Load data - VEC_DATA_TYPE(DATA_TYPE_RES, 16) - in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16)); - VEC_DATA_TYPE(DATA_TYPE_RES, 16) - in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16)); - - // Perform division -#ifdef DATA_TYPE_FLOAT - VEC_DATA_TYPE(DATA_TYPE_OUT, 16) - res = CONVERT(in1_data / in2_data * (DATA_TYPE_RES)scale, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)); -#else /* DATA_TYPE_FLOAT */ - VEC_DATA_TYPE(DATA_TYPE_OUT, 16) - res = CONVERT_OP_FLOAT(CONVERT_OP_FLOAT((convert_float16(in1_data / in2_data) * scale), VEC_DATA_TYPE(DATA_TYPE_RES, 16), ROUND), VEC_DATA_TYPE(DATA_TYPE_OUT, 16), ROUND); -#endif /* DATA_TYPE_FLOAT */ - - // Store result - vstore16(res, 0, (__global DATA_TYPE_OUT *)out.ptr); -} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl deleted file mode 100644 index fdfb78003..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016, 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#if defined(SATURATE) -#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size##_sat(x)) -#else // SATURATE -#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size(x)) -#endif // SATURATE -#define CONVERT_OP_INT(x, type, size) CONVERT_OP_INT_STR(x, type, size) - -#define DIV_OP(x, y, scale, type, size) CONVERT_OP_INT((x) / (y) >> scale, type, size) - -/** Performs a pixelwise division with integer scale of integer inputs. - * - * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: - * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short - * @attention The data_type of the intermediate result of the division should passed as well using -DDATA_TYPE_RES. - * e.g. If one of inputs is S16 -DDATA_TYPE_RES=int should be passed else -DDATA_TYPE_RES=short. - * - * @param[in] in1_ptr Pointer to the source image. Supported data types: U8/S16 - * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in1_stride_z Stride of the source image in Y dimension (in bytes) - * @param[in] in1_step_z in1_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] in2_ptr Pointer to the source image. Supported data types: same as @p in1_ptr - * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in2_stride_z Stride of the source image in Y dimension (in bytes) - * @param[in] in2_step_z in2_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in1_ptr - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_stride_z Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_z out_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] scale Integer scaling factor. Supported data types: S32 - */ -__kernel void pixelwise_div_int( - TENSOR3D_DECLARATION(in1), - TENSOR3D_DECLARATION(in2), - TENSOR3D_DECLARATION(out), - const uint scale) -{ - // Get pixels pointer - Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); - Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); - - // Load data - VEC_DATA_TYPE(DATA_TYPE_RES, 16) - in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16)); - VEC_DATA_TYPE(DATA_TYPE_RES, 16) - in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16)); - - // Perform division and store result - vstore16(DIV_OP(in1_data, in2_data, scale, DATA_TYPE_OUT, 16), 0, (__global DATA_TYPE_OUT *)out.ptr); -} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl deleted file mode 100644 index ab1307e64..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016, 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers_asymm.h" - -#ifdef SATURATE -#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x)) -#else /* SATURATE */ -#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x)) -#endif /* SATURATE */ -#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round) - -#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT) -/** Performs a pixelwise multiplication used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8 - * - * The following computations will be performed by the kernel: - * - * -# Add offset terms to inputs - * -# Multiply inputs - * -# Add offset terms to final result - * -# Multiply each entry of result by result_mult_int - * -# Shift the int32 accumulator by result_shift - * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. - * - * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: - * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar - * @attention The offset factor of inputs must be passed at compile time using -DIN1_OFFSET and -DIN2_OFFSET - * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT - * - * @param[in] in1_ptr Pointer to the source image. Supported data types: U8 - * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in1_stride_z Stride of the source image in Y dimension (in bytes) - * @param[in] in1_step_z in1_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] in2_ptr Pointer to the source image. Supported data types: U8 - * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in2_stride_z Stride of the source image in Y dimension (in bytes) - * @param[in] in2_step_z in2_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: U8 - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_stride_z Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_z out_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] scale Float scaling factor. Supported data types: F32 - */ -__kernel void pixelwise_mul_qasymm8( - TENSOR3D_DECLARATION(in1), - TENSOR3D_DECLARATION(in2), - TENSOR3D_DECLARATION(out), - const float scale) -{ - // Get pixels pointer - Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); - Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); - - // Load data - VEC_DATA_TYPE(int, 16) - in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16)); - VEC_DATA_TYPE(int, 16) - in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16)); - - // Perform multiplication of two inputs - VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET); - VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET); - VEC_DATA_TYPE(int, 16) out_val = in1_val * in2_val; - - // Multiply with a multiplier smaller than 1 - out_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(out_val, RESULT_MULT_INT, RESULT_SHIFT, 16); - out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET); - - VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16)); - -// TODO: Apply min-max BOUND to support fuse with relu. -/* -#if defined(MIN_BOUND) - res = max(res, (uchar16)MIN_BOUND); -#endif // defined(MIN_BOUND) -#if defined(MAX_BOUND) - res = min(res, (uchar16)MAX_BOUND); -#endif // defined(MAX_BOUND) -*/ - - // Store result - VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), - 0, (__global DATA_TYPE_OUT *)out.ptr); -} -#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl deleted file mode 100644 index 68da2ba32..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#ifndef VEC_SIZE -#define VEC_SIZE 1 -#endif - -#if defined(DATA_TYPE) -/** Returns result of prelu function implemented as below: - * f(input) = alpha * input for input < 0, f(input) = input for input >= 0. - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @note Can only take floating point data types. - * - * @param[in] input1_ptr Pointer to the source image. Supported Data types : F16/F32 - * @param[in] input1_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input1_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source image - * - * @param[in] alpha_ptr Pointer to the source image. Supported Data types : F16/F32 - * @param[in] alpha_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] alpha_step_x input2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] alpha_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] alpha_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] alpha_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source image - * - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void prelu( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(alpha), - TENSOR3D_DECLARATION(output)) -{ - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VSTORE(VEC_SIZE) - (VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) < 0 ? - VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) * VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)alpha.ptr) : - VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), - 0, (__global DATA_TYPE *)output.ptr); - -} -#endif // defined(DATA_TYPE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl deleted file mode 100644 index 7e97b7ed6..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" -#define SUB(x, y) (x) - (y) - -#if defined(OFF_IN1) && defined(OFF_IN2) && defined(OFF_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(VEC_SIZE) - -#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE) -#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) -#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE) -#define CONVERT_RTE(x, type) (convert_##type##_rte((x))) -#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type) - -/** Returns result of prelu function implemented as below: - * f(input) = alpha * input for input < 0, f(input) = input for input >= 0. - * - * @attention Data type can be passed using the -DDATA_TYPE_IN compile flag, e.g. -DDATA_TYPE_IN=uchar - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @note Can only take uchar data types. - * - * @param[in] input1_ptr Pointer to the source image. Supported Data types : QASYMM8 - * @param[in] input1_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input1_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source image - * - * @param[in] alpha_ptr Pointer to the source image. Supported Data types : QASYMM8 - * @param[in] alpha_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] alpha_step_x input2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] alpha_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] alpha_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] alpha_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source image - * - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void prelu_qasymm8( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(alpha), - TENSOR3D_DECLARATION(output)) -{ - // Get pixels pointer - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)input.ptr), VEC_INT); - VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)alpha.ptr), VEC_INT); - - in_a = SUB(in_a, (VEC_INT)((int)OFF_IN1)); - in_b = SUB(in_b, (VEC_INT)((int)OFF_IN2)); - - const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1); - const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2); - const VEC_FLOAT outf32 = in1f32 < 0 ? in1f32 * in2f32 : in1f32; - const VEC_FLOAT qresf32 = outf32 / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFF_OUT)); - const VEC_UCHAR res = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_UCHAR); - - VSTORE(VEC_SIZE) - (res, 0, (__global uchar *)output.ptr); -} - -#endif // defined(OFF_IN1) && defined(OFF_IN2) && defined(OFF_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(VEC_SIZE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl deleted file mode 100644 index 8bef49363..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016, 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) -/** Perform reduce max/min - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 - * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using - * -DOP_CODE = number. e.g. -DOP_CODE=1 - * - * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] axis Axis through which reduction occurs - * @param[in] dim Dimension across the axis to be reduced. - */ -__kernel void reduce_min_max(TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output), - const int axis, - const int dim) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); - - int indices[4] = - { - get_global_id(0), - get_global_id(1), - get_global_id(2) % DEPTH_OUT, - get_global_id(2) / DEPTH_OUT, - }; - - DATA_TYPE value = *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); - for(int i = 1; i < dim; ++i) - { - indices[axis] = i; - - #if OP_CODE == 1 // REDUCE_MAX - value = max(value, *((__global DATA_TYPE *) - tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]))); - - #elif OP_CODE == 2 // REDUCE_MIN - value = min(value, *((__global DATA_TYPE *) - tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]))); - - #else // OP NOT SUPPORTED - return; - - #endif - } - - *((__global DATA_TYPE *)out.ptr) = value; -} - -/** Perform reduce sum/mean - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 - * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using - * -DOP_CODE = number. e.g. -DOP_CODE=1 - * - * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] axis Axis through which reduction occurs - * @param[in] dim Dimension across the axis to be reduced. - */ -__kernel void reduce_sum_mean(TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output), - const int axis, - const int dim) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); - - int indices[4] = - { - get_global_id(0), - get_global_id(1), - get_global_id(2) % DEPTH_OUT, - get_global_id(2) / DEPTH_OUT, - }; - - DATA_TYPE sum_value = (DATA_TYPE)0; - for(int i = 0; i < dim; ++i) - { - indices[axis] = i; - sum_value += *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); - } - - #if OP_CODE == 3 // REDUCE_SUM - *((__global DATA_TYPE *)out.ptr) = sum_value; - - #elif OP_CODE == 4 // REDUCE_MEAN - *((__global DATA_TYPE *)out.ptr) = sum_value / CONVERT(dim, DATA_TYPE); - - #else // OP NOT SUPPORTED - return; - - #endif -} -#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl deleted file mode 100644 index a0fc2d5a9..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016, 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) -/** Perform space to batch with input of 4D and NCHW format - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 - * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size. e.g. -DBATCH_IN=16 - * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DHEIGHT_IN=16 - * @attention Input tensor width should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DWIDTH_IN=16 - * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] block_size_ptr Pointer to the source tensor. Supported data types: S32 - * @param[in] block_size_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] block_size_step_x block_size_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] block_size_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] padding_size_ptr Pointer to the source tensor. Supported data types: S32 - * @param[in] padding_size_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] padding_size_step_x padding_size_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] padding_size_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] padding_size_step_y padding_size_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] padding_size_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void space_to_batch_4d_nchw(TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output), - VECTOR_DECLARATION(block_size), - IMAGE_DECLARATION(padding_size)) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); - - int block_size_x = *((__global int *)(block_size_ptr)); - int block_size_y = *((__global int *)(block_size_ptr + block_size_stride_x)); - int shift_x = (get_global_id(2) / DEPTH_OUT / BATCH_IN) % block_size_x; - int shift_y = (get_global_id(2) / DEPTH_OUT / BATCH_IN) / block_size_x; - - int in_index[4] = {0, }; - in_index[0] = get_global_id(0) * block_size_x + shift_x - *((__global int *)(padding_size_ptr)); - in_index[1] = get_global_id(1) * block_size_y + shift_y - *((__global int *)(padding_size_ptr + padding_size_stride_y)); - in_index[2] = get_global_id(2) % DEPTH_OUT; - in_index[3] = (get_global_id(2) / DEPTH_OUT) % BATCH_IN; - - if (in_index[0] < 0 || in_index[0] >= WIDTH_IN || in_index[1] < 0 || in_index[1] >= HEIGHT_IN) - { - *((__global DATA_TYPE *)out.ptr) = (DATA_TYPE)ZERO_VALUE; - } - else - { - *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3])); - } -} -#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) - -#if defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE) -/** Perform space to batch with input of 4D and NHWC format - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Output tensor depth should be given as a preprocessor argument using -DHEIGHT_OUT=size. e.g. -DHEIGHT_OUT=16 - * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size. e.g. -DBATCH_IN=16 - * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DHEIGHT_IN=16 - * @attention Input tensor width should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DWIDTH_IN=16 - * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0 - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] block_size_ptr Pointer to the source tensor. Supported data types: S32 - * @param[in] block_size_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] block_size_step_x block_size_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] block_size_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] padding_size_ptr Pointer to the source tensor. Supported data types: S32 - * @param[in] padding_size_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] padding_size_step_x padding_size_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] padding_size_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] padding_size_step_y padding_size_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] padding_size_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void space_to_batch_4d_nhwc(TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output), - VECTOR_DECLARATION(block_size), - IMAGE_DECLARATION(padding_size)) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, HEIGHT_OUT); - - int block_size_x = *((__global int *)(block_size_ptr)); - int block_size_y = *((__global int *)(block_size_ptr + block_size_stride_x)); - int shift_x = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) % block_size_x; - int shift_y = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) / block_size_x; - - int in_index[4] = {0, }; - in_index[0] = get_global_id(0) * VEC_SIZE; - in_index[1] = get_global_id(1) * block_size_x + shift_x - *((__global int *)(padding_size_ptr)); - in_index[2] = get_global_id(2) % HEIGHT_OUT * block_size_y + shift_y - *((__global int *)(padding_size_ptr + padding_size_stride_y)); - in_index[3] = (get_global_id(2) / HEIGHT_OUT) % BATCH_IN; - - if (in_index[1] < 0 || in_index[1] >= WIDTH_IN || in_index[2] < 0 || in_index[2] >= HEIGHT_IN) - { - VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))ZERO_VALUE, 0, (__global DATA_TYPE *)out.ptr); - } - else - { - VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3])), - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), - 0, (__global DATA_TYPE *)out.ptr); - } -} - -#endif // defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl deleted file mode 100644 index f6977045a..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016, 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) -/** Perform space to depth rearrangement of tensor - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16 - * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. -DBLOCK_SIZE=1 - * - * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p inpu -t_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in -bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void space_to_depth( - TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output)) - { - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); - - int out_index[4]={0}; - int in_index[4]={0}; - - in_index[0] = get_global_id(0);//W - in_index[1] = get_global_id(1);//H - in_index[2] = get_global_id(2) % DEPTH_IN;//C - in_index[3] = get_global_id(2) / DEPTH_IN;//B - - out_index[0] = in_index[0]/BLOCK_SIZE; - out_index[1] = in_index[1]/BLOCK_SIZE; - out_index[2] = in_index[2] + ((in_index[1] % BLOCK_SIZE) * BLOCK_SIZE + in_index[0] % BLOCK_SIZE) * DEPTH_IN; - out_index[3] = in_index[3]; - - *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0],out_index[1],out_index[2],out_index[3])) = *((__global DATA_TYPE *)in.ptr); - } -#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl deleted file mode 100644 index 3e1a5c97f..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#ifndef VEC_SIZE -#define VEC_SIZE 1 -#endif - -#if defined(DATA_TYPE) -/** Returns true value of squared_difference of two tensors. - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @note Can only take floating point data types. - * - * @param[in] input1_ptr Pointer to the source image. Supported data types: F16/F32 - * @param[in] input1_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input1_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source image - * - * @param[in] input2_ptr Pointer to the source image. Supported data types: F16/F32 - * @param[in] input2_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input2_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input2_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input2_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input2_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source image - * - * @param[out] output_ptr Pointer to the destination image. Supported data types: F16/F32 - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void squared_difference( - TENSOR3D_DECLARATION(input1), - TENSOR3D_DECLARATION(input2), - TENSOR3D_DECLARATION(output)) -{ - Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1); - Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - diff = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr)- VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr); - - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - sq_diff = diff * diff; - - VSTORE(VEC_SIZE) - (sq_diff, 0, (__global DATA_TYPE *)output.ptr); -} -#endif // defined(DATA_TYPE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice_ex.cl deleted file mode 100644 index b39c55b96..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice_ex.cl +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#if defined(ELEMENT_DATA_TYPE) && defined(DEPTH_OUT) -/** Extracts a strided slice up to 4-dimensions - * - * @note Datatype should be given as a preprocessor argument using -DELEMENT_DATA_TYPE=type. e.g. -DELEMENT_DATA_TYPE=short - * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 - * - * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] starts The stride of X dimension of input tensor to be sliced. Supported data types: S32 - * @param[in] strides The stride of Y dimension of input tensor to be sliced. Supported data types: S32 - */ -__kernel void strided_slice_ex(TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output), - const int4 starts, - const int4 strides) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); - - int4 indices_in = - { - starts.x + (strides.x * get_global_id(0)), - starts.y + (strides.y * get_global_id(1)), - starts.z + (strides.z * (get_global_id(2) % DEPTH_OUT)), - starts.w + (strides.w * (get_global_id(2) / DEPTH_OUT)), - }; - *((__global ELEMENT_DATA_TYPE *)out.ptr) = *((__global ELEMENT_DATA_TYPE *)tensor4D_offset(&in, indices_in.x, indices_in.y, indices_in.z, indices_in.w)); -} -#endif // defined(ELEMENT_DATA_TYPE) && defined(DEPTH_OUT) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl deleted file mode 100644 index d97f23a47..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "helpers.h" - -__kernel void topkv2_init(VECTOR_DECLARATION(input), - __global float* in_key_buf, - __global int* in_ind_buf, - const int n) -{ - int gid = get_global_id(0); - int lws = get_local_size(0); - int groups = get_num_groups(0); - int gws = lws * groups; - int iter = n / gws; - - Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input); - - for(int i = 0; i < iter; ++i) - { - int idx = i * gws + gid; - in_key_buf[idx] = *(__global float*)(input.ptr + idx * input.stride_x); - in_ind_buf[idx] = idx; - } -} - -__kernel void topkv2_find_first_negative( - __global float *out_key_buf, - __global int *first_negative_idx, - int n) -{ - int gid = get_global_id(0); - - if( gid == n - 1 ) - { - // if the last item is positive, the first negative index is n. - if( out_key_buf[gid] > 0.f ) - *first_negative_idx = n; - } else if ( gid == 0 ) { - // if the first item is negative, set it 0. - if( out_key_buf[gid] < 0.f ) - *first_negative_idx = 0; - } else { - // if its left is positive and it is negative, then it is the first negative item. - if( out_key_buf[gid-1] > 0.f && out_key_buf[gid] < 0.f ) - *first_negative_idx = gid; - } -} - -__kernel void topkv2_reorder_negatives( - __global float* in_key_buf, - __global float* out_key_buf, - __global float* in_ind_buf, - __global float* out_ind_buf, - __global int* first_negative_idx, - int n) -{ - int gid = get_global_id(0); - - int num_negs = n - *first_negative_idx; - int in_idx; - - if( gid < num_negs ) { - in_idx = n - 1 - gid; - } else { - in_idx = gid - num_negs; - } - - out_key_buf[gid] = in_key_buf[in_idx]; - out_ind_buf[gid] = in_ind_buf[in_idx]; -} - -__kernel void topkv2_store( - VECTOR_DECLARATION(values), - VECTOR_DECLARATION(indices), - __global float *out_key_buf, - __global int *out_ind_buf, - int n) -{ - int gid = get_global_id(0); - - Vector values = CONVERT_TO_VECTOR_STRUCT_NO_STEP(values); - Vector indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(indices); - - int idx = n - 1 - gid; - - *(__global float*)(values.ptr + gid * values.stride_x) = out_key_buf[idx]; - *(__global int*)(indices.ptr + gid * indices.stride_x) = out_ind_buf[idx]; -} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl deleted file mode 100644 index 0292fab04..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "helpers.h" - -__global inline float* get_vec_elem(Vector* vec, int idx) -{ - return (__global float*)(vec->ptr + idx * vec->stride_x); -} - -__global inline int* get_vec_elem_int(Vector* vec, int idx) -{ - return (__global int*)(vec->ptr + idx * vec->stride_x); -} - -// A utility function to swap two elements -void swap(__global float *a, __global float *b) -{ - float t = *a; - *a = *b; - *b = t; -} - -void swap_idx(__global int *a, __global int *b) -{ - int t = *a; - *a = *b; - *b = t; -} - -/* This function is same in both iterative and recursive*/ -int partition (Vector* arr, __global int* indices, int l, int h) -{ - float x = *get_vec_elem(arr, h); - int i = (l - 1); - - for (int j = l; j <= h- 1; j++) - { - if (*get_vec_elem(arr, j) >= x) - { - i++; - swap (get_vec_elem(arr,i), get_vec_elem(arr,j)); - swap_idx(&indices[i], &indices[j]); - } - } - swap (get_vec_elem(arr, i + 1), get_vec_elem(arr, h)); - swap_idx(&indices[i + 1], &indices[h]); - return (i + 1); -} - -/* A[] --> Array to be sorted, - l --> Starting index, - h --> Ending index */ -void quickSortIterative (Vector* arr, __global int* indices, - __global int *stack, int l, int h) -{ - // Create an auxiliary stack - - // initialize top of stack - int top = -1; - - // push initial values of l and h to stack - stack[ ++top ] = l; - stack[ ++top ] = h; - - // Keep popping from stack while is not empty - while ( top >= 0 ) - { - // Pop h and l - h = stack[ top-- ]; - l = stack[ top-- ]; - - // Set pivot element at its correct position - // in sorted array - int p = partition( arr, indices, l, h ); - - // If there are elements on left side of pivot, - // then push left side to stack - if ( p-1 > l ) - { - stack[ ++top ] = l; - stack[ ++top ] = p - 1; - } - - // If there are elements on right side of pivot, - // then push right side to stack - if ( p+1 < h ) - { - stack[ ++top ] = p + 1; - stack[ ++top ] = h; - } - } -} - -__kernel void topkv2_quicksort(VECTOR_DECLARATION(input), - VECTOR_DECLARATION(topk_values), VECTOR_DECLARATION(topk_indices), - __global int* indices, __global int* temp_stack, int k, int n) -{ - Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input); - Vector topk_values = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_values); - Vector topk_indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_indices); - - for( int i = 0; i < n; ++i ) - { - indices[i] = i; - } - - quickSortIterative(&input, indices, temp_stack, 0, n-1); - - // extract k items. - for(int i = 0; i < k; ++i) - { - *get_vec_elem(&topk_values, i) = *get_vec_elem(&input, i); - *get_vec_elem_int(&topk_indices, i) = indices[i]; - } -} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl deleted file mode 100644 index c2c2d89a4..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl +++ /dev/null @@ -1,271 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// reference: -// https://code.google.com/archive/p/ocl-radix-sort/source/default/source -// OpenCL kernel sources for the CLRadixSort class -// the #include does not exist in OpenCL -// Copyright Philippe Helluy, Université de Strasbourg, France, 2011, helluy@math.unistra.fr -// licensed under the GNU Lesser General Public License see http://www.gnu.org/copyleft/lesser.html -// if you find this software usefull you can cite the following work in your reports or articles: -// Philippe HELLUY, A portable implementation of the radix sort algorithm in OpenCL, 2011. -// http://hal.archives-ouvertes.fr/hal-00596730 - -// Reference for floating point radix sort: -// http://www.codercorner.com/RadixSortRevisited.htm - -// compute the histogram for each radix and each virtual processor for the pass -__kernel void radixsort_histogram(__global float* in_key_buf, - __global int* d_Histograms, - const int pass, - __local int* loc_histo, - const int n) -{ - int it = get_local_id(0); // i local number of the processor - int ig = get_global_id(0); // global number = i + g I - - int gr = get_group_id(0); // g group number - - int groups = get_num_groups(0); - int items = get_local_size(0); - - // set the local histograms to zero - for(int ir=0;ir<_RADIX;ir++){ - loc_histo[ir * items + it] = 0; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // range of keys that are analyzed by the work item - int size= n/groups/items; // size of the sub-list - int start= ig * size; // beginning of the sub-list - - unsigned int key; - int shortkey,k; - - // compute the index - // the computation depends on the transposition - for(int j = 0; j < size ; j++) { -#ifdef TRANSPOSE - k= groups * items * j + ig; -#else - k=j+start; -#endif - - key = *((__global unsigned int*)(in_key_buf + k)); - - // extract the group of _BITS bits of the pass - // the result is in the range 0.._RADIX-1 - shortkey=(( key >> (pass * _BITS)) & (_RADIX-1)); - - // increment the local histogram - loc_histo[shortkey * items + it ]++; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // copy the local histogram to the global one - for(int ir=0;ir<_RADIX;ir++) { - d_Histograms[items * (ir * groups + gr) + it] = loc_histo[ir * items + it]; - } - - barrier(CLK_GLOBAL_MEM_FENCE); -} - -// initial transpose of the list for improving -// coalescent memory access -__kernel void transpose(const __global int* invect, - __global int* outvect, - const int nbcol, - const int nbrow, - const __global int* inperm, - __global int* outperm, - __local int* blockmat, - __local int* blockperm, - const int tilesize){ - - int i0 = get_global_id(0)*tilesize; // first row index - int j = get_global_id(1); // column index - - int jloc = get_local_id(1); // local column index - - // fill the cache - for(int iloc=0;iloc<tilesize;iloc++){ - int k=(i0+iloc)*nbcol+j; // position in the matrix - blockmat[iloc*tilesize+jloc]=invect[k]; -#ifdef PERMUT - blockperm[iloc*tilesize+jloc]=inperm[k]; -#endif - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // first row index in the transpose - int j0=get_group_id(1)*tilesize; - - // put the cache at the good place - for(int iloc=0;iloc<tilesize;iloc++){ - int kt=(j0+iloc)*nbrow+i0+jloc; // position in the transpose - outvect[kt]=blockmat[jloc*tilesize+iloc]; -#ifdef PERMUT - outperm[kt]=blockperm[jloc*tilesize+iloc]; -#endif - } - -} - -// each virtual processor reorders its data using the scanned histogram -__kernel void radixsort_reorder(__global float* in_key, - __global float* out_key, - __global int* d_Histograms, - const int pass, - __global int* indices_in, - __global int* indices_out, - __local int* loc_histo, - const int n){ - - int it = get_local_id(0); - int ig = get_global_id(0); - - int gr = get_group_id(0); - int groups=get_num_groups(0); - int items=get_local_size(0); - - int start= ig *(n/groups/items); - int size= n/groups/items; - - // take the histogram in the cache - for(int ir=0;ir<_RADIX;ir++){ - loc_histo[ir * items + it]= - d_Histograms[items * (ir * groups + gr) + it]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - int newpos,shortkey,k,newpost; - unsigned int key; - - for(int j= 0; j< size;j++){ -#ifdef TRANSPOSE - k= groups * items * j + ig; -#else - k=j+start; -#endif - float org_value = in_key[k]; - key = *(__global unsigned int*)(in_key + k); - shortkey=((key >> (pass * _BITS)) & (_RADIX-1)); - - newpos=loc_histo[shortkey * items + it]; - -#ifdef TRANSPOSE - int ignew,jnew; - ignew= newpos/(n/groups/items); - jnew = newpos%(n/groups/items); - newpost = jnew * (groups*items) + ignew; -#else - newpost=newpos; -#endif - - //d_outKeys[newpost]= key; // killing line !!! - out_key[newpost] = org_value; - -#ifdef PERMUT - indices_out[newpost] = indices_in[k]; -#endif - - newpos++; - loc_histo[shortkey * items + it]=newpos; - } -} - -// perform a parallel prefix sum (a scan) on the local histograms -// (see Blelloch 1990) each workitem worries about two memories -// see also http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html -__kernel void radixsort_scanhistograms(__global int* histo, __local int* temp, __global int* globsum) -{ - int it = get_local_id(0); - int ig = get_global_id(0); - int decale = 1; - int n=get_local_size(0) * 2 ; - int gr=get_group_id(0); - - // load input into local memory - // up sweep phase - temp[2*it] = histo[2*ig]; - temp[2*it+1] = histo[2*ig+1]; - - // parallel prefix sum (algorithm of Blelloch 1990) - for (int d = n>>1; d > 0; d >>= 1){ - barrier(CLK_LOCAL_MEM_FENCE); - if (it < d){ - int ai = decale*(2*it+1)-1; - int bi = decale*(2*it+2)-1; - temp[bi] += temp[ai]; - } - decale *= 2; - } - - // store the last element in the global sum vector - // (maybe used in the next step for constructing the global scan) - // clear the last element - if (it == 0) { - globsum[gr]=temp[n-1]; - temp[n - 1] = 0; - } - - // down sweep phase - for (int d = 1; d < n; d *= 2){ - decale >>= 1; - barrier(CLK_LOCAL_MEM_FENCE); - - if (it < d){ - int ai = decale*(2*it+1)-1; - int bi = decale*(2*it+2)-1; - - int t = temp[ai]; - temp[ai] = temp[bi]; - temp[bi] += t; - } - - } - barrier(CLK_LOCAL_MEM_FENCE); - - // write results to device memory - - histo[2*ig] = temp[2*it]; - histo[2*ig+1] = temp[2*it+1]; - - barrier(CLK_GLOBAL_MEM_FENCE); - -} - -// use the global sum for updating the local histograms -// each work item updates two values -__kernel void radixsort_pastehistograms( __global int* histo,__global int* globsum) -{ - int ig = get_global_id(0); - int gr=get_group_id(0); - - int s; - - s=globsum[gr]; - - // write results to device memory - histo[2*ig] += s; - histo[2*ig+1] += s; - - barrier(CLK_GLOBAL_MEM_FENCE); -} |