diff options
Diffstat (limited to 'compute/ARMComputeEx/src/core/CL/cl_kernels')
9 files changed, 374 insertions, 1206 deletions
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl deleted file mode 100644 index 03717cfe9..000000000 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) -/** Perform arg_max/arg_min - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. - * e.g. -DDATA_TYPE=short - * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. - * e.g. -DDEPTH_OUT=16 - * @attention Operation type(code) specifying which operation to perform should be passed as - * preprocessor argument using -DOP_CODE = number. e.g. -DOP_CODE=1 - * - * @param[in] input_ptr Pointer to the source image. Supported data - * types: - * U8/QASYMM8/S8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension - * (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension - * (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension - * (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element - * in the source image - * @param[in] input_stride_w Stride of the source tensor in W dimension - * (in bytes) - * @param[in] input_step_w output_stride_w * number of elements along W - * processed per workitem(in bytes) - * @param[out] output_ptr Pointer to the destination image. 
- * Supported data types: U32 - * @param[in] output_stride_x Stride of the destination image in X dimension - * (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension - * (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension - * (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension - * (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W - * processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the - * destination image - * @param[in] axis Axis through which reduction occurs - * @param[in] dim Dimension across the axis to be reduced. 
- */ - -__kernel void arg_op(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), const int axis, - const int dim) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); - - int indices[4] = { - get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT, - get_global_id(2) / DEPTH_OUT, - }; - - DATA_TYPE value = - *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); - DATA_TYPE tval = value; - int idx = 0; - for (int i = 1; i < dim; ++i) - { - indices[axis] = i; - -#if OP_CODE == 1 // ArgMax - value = max(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], - indices[2], indices[3]))); -#elif OP_CODE == 2 // ArgMin - value = min(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], - indices[2], indices[3]))); -#else - return; - -#endif - - if (tval != value) - { - idx = indices[axis]; - tval = value; - } - } - - *((__global uint *)out.ptr) = idx; -} -#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl deleted file mode 100644 index f74c1c103..000000000 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016, 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "helpers_asymm.h" - -#ifdef SATURATE -#define ADD(x, y) add_sat((x), (y)) -#define SUB(x, y) sub_sat((x), (y)) -#else /* SATURATE */ -#define ADD(x, y) (x) + (y) -#define SUB(x, y) (x) - (y) -#endif /* SATURATE */ - -/** Performs a pixelwise addition used to quantize down the int32 accumulator values of GEMMLowp to - * QASYMM8 - * - * The following computations will be performed: - * - * -# Add offset terms to inputs - -# Get scaled value of two inputs - * -# Add inputs - * -# Add offset terms to final result - * -# Multiply each entry of result by result_mult_int - * -# Shift the int32 accumulator by result_shift - * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. - * - * @attention The inputs and output data types need to be passed at compile time using - * -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: - * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar - * @attention The number of bits to shift left of input tensors must be passed at compile time using - * -DLEFT_SHIFT - * @attention The offset, scalar scale factor and number of bits to shift right of input tensors - * must be passed at compile time using -DIN1_OFFSET, -RIN1_MULT_INT, -DIN1_SHIFT, - -DIN2_OFFSET, - * -RIN2_MULT_INT and -DIN2_SHIFT - * @attention The offset, scalar scale factor and number of bits to shift right of output tensor - * must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and - -DRESULT_SHIFT - * - * @attention The input and output data_types need to be passed at compile time using - * -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: - * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar - * @attention The inputs and output scale information of qasymm8 need to be passed at compile time - * using -DSCALE_IN1, -DSCALE_IN2 and -DSCALE_OUT: - * e.g. 
-DSCALE_IN1=1.f -DSCALE_IN2=1.f -DSCALE_OUT=2.f - * @attention The inputs and output scale offset need to be passed at compile time using - * -DOFFSET_IN1, -DOFFSET_IN2 and -DOFFSET_OUT: - * e.g. -DOFFSET_IN1=0 -DOFFSET_IN2=0 -DOFFSET_OUT=0 - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. - * -DVEC_SIZE=16 - * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise - * wrapping policy will be used. - * - * @param[in] in1_ptr Pointer to the source tensor. - * Supported data types: QASYMM8 - * @param[in] in1_stride_x Stride of the source tensor in X dimension - * (in bytes) - * @param[in] in1_step_x in1_stride_x * number of elements along X processed - * per workitem(in bytes) - * @param[in] in1_stride_y Stride of the source tensor in Y dimension - * (in bytes) - * @param[in] in1_step_y in1_stride_y * number of elements along Y processed - * per workitem(in bytes) - * @param[in] in1_stride_z Stride of the source tensor in Z dimension - * (in bytes) - * @param[in] in1_step_z in1_stride_z * number of elements along Z processed - * per workitem(in bytes) - * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source - * tensor - * @param[in] in2_ptr Pointer to the source tensor. 
Supported data types: - * QASYMM8 - * @param[in] in2_stride_x Stride of the source tensor in X dimension - * (in bytes) - * @param[in] in2_step_x in2_stride_x * number of elements along X processed - * per workitem(in bytes) - * @param[in] in2_stride_y Stride of the source tensor in Y dimension - * (in bytes) - * @param[in] in2_step_y in2_stride_y * number of elements along Y processed - * per workitem(in bytes) - * @param[in] in2_stride_z Stride of the source tensor in Z dimension - * (in bytes) - * @param[in] in2_step_z in2_stride_z * number of elements along Z processed - * per workitem(in bytes) - * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source - * tensor - * @param[out] out_ptr Pointer to the destination tensor. - * Supported data types: QASYMM8 - * @param[in] out_stride_x Stride of the destination tensor in X dimension - * (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed - * per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination tensor in Y dimension - * (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed - * per workitem(in bytes) - * @param[in] out_stride_z Stride of the source tensor in Z dimension - * (in bytes) - * @param[in] out_step_z out_stride_z * number of elements along Z processed - * per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination - * tensor - */ -__kernel void arithmetic_add_qasymm8(TENSOR3D_DECLARATION(in1), TENSOR3D_DECLARATION(in2), - TENSOR3D_DECLARATION(out)) -{ - // Get pixels pointer - Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); - Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); - - // Load data - VEC_DATA_TYPE(int, 16) - in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16)); - VEC_DATA_TYPE(int, 16) - in2_data = CONVERT(vload16(0, 
(__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16)); - - // Get scaled value of two inputs - VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET); - VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET); - - VEC_DATA_TYPE(int, 16) - left_shift = (VEC_DATA_TYPE(int, 16))1 << (VEC_DATA_TYPE(int, 16))(LEFT_SHIFT); - VEC_DATA_TYPE(int, 16) shifted_in1_val = in1_val * left_shift; - VEC_DATA_TYPE(int, 16) shifted_in2_val = in2_val * left_shift; - - VEC_DATA_TYPE(int, 16) - scaled_in1_val = - ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in1_val, IN1_MULT_INT, IN1_SHIFT, 16); - VEC_DATA_TYPE(int, 16) - scaled_in2_val = - ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in2_val, IN2_MULT_INT, IN2_SHIFT, 16); - - // Add inputs and multiply with a multiplier smaller than 1 - VEC_DATA_TYPE(int, 16) sum_val = scaled_in1_val + scaled_in2_val; - VEC_DATA_TYPE(int, 16) - out_val = - ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(sum_val, RESULT_MULT_INT, RESULT_SHIFT, 16); - out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET); - - VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16)); - - // TODO: Apply min-max BOUND to support fuse with relu. - /* - #if defined(MIN_BOUND) - res = max(res, (uchar16)MIN_BOUND); - #endif // defined(MIN_BOUND) - #if defined(MAX_BOUND) - res = min(res, (uchar16)MAX_BOUND); - #endif // defined(MAX_BOUND) - */ - - // Store result - VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr); -} diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl deleted file mode 100644 index 4147a0017..000000000 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl +++ /dev/null @@ -1,233 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "helpers.h" - -#ifndef SCALE -#define SCALE 1.0f -#endif -#ifndef OFFSET -#define OFFSET 0 -#endif -#ifndef VEC_SIZE -#define VEC_SIZE 1 -#endif - -#if defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) -/** Perform a cast operation on an input tensor. - * - * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and - * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. - * -DVEC_SIZE=16 - * @attention -DBOOL_INPUT : Whether type of input is bool. - * - * @param[in] input_ptr Pointer to the source image. Supported data - * types: F16/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in - * bytes) - * @param[in] input_step_x input_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in - * bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source - * image - * @param[out] output_ptr Pointer to the destination image. 
Supported data - * types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension - * (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension - * (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the - * destination image - */ -__kernel void cast(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) -{ - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VSTORE(VEC_SIZE) - (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr), - VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), - 0, (__global DATA_TYPE_OUT *)output.ptr); - VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE) - res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr), - VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)); -#if defined(BOOL_INPUT) - VEC_DATA_TYPE(char, VEC_SIZE) tmp = CONVERT(res, VEC_DATA_TYPE(char, VEC_SIZE)); - VEC_DATA_TYPE(char, VEC_SIZE) mask = (VEC_DATA_TYPE(char, VEC_SIZE))(1); - res = CONVERT(tmp & mask, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)); -#endif // defined(BOOL_INPUT) - - VSTORE(VEC_SIZE)(res, 0, (__global DATA_TYPE_OUT *)output.ptr); -} - -/** Perform a cast operation on an QASYMM8 input tensor. - * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and - * -DDATA_TYPE_OUT compile flag, e.g. 
-DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int - * @attention Offset and Scale of input should be given as a preprocessor argument using - * -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5 - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. - * -DVEC_SIZE=16 - * - * @param[in] input_ptr Pointer to the source image. Supported data - * types: F16/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in - * bytes) - * @param[in] input_step_x input_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in - * bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source - * image - * @param[out] output_ptr Pointer to the destination image. 
Supported data - * types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension - * (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension - * (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the - * destination image - */ -__kernel void cast_qasymm_in(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) -{ - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) - in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr); - VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET); - VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE); - - VEC_DATA_TYPE(int, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(int, VEC_SIZE)) - offset; - VEC_DATA_TYPE(float, VEC_SIZE) out_data = CONVERT(tmp, VEC_DATA_TYPE(float, VEC_SIZE)) * scale; - - VSTORE(VEC_SIZE) - (CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, - (__global DATA_TYPE_OUT *)output.ptr); -} - -/** Perform a cast operation on an QASYMM8 output tensor. - * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and - * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int - * @attention Offset and Scale of output should be given as a preprocessor argument using - * -DOFFSET=int, -DSCALE=float. e.g. 
-DOFFSET=1, -DSCALE=0.5 - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. - * -DVEC_SIZE=16 - * - * @param[in] input_ptr Pointer to the source image. Supported data - * types: F16/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in - * bytes) - * @param[in] input_step_x input_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in - * bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source - * image - * @param[out] output_ptr Pointer to the destination image. Supported data - * types: U8 - * @param[in] output_stride_x Stride of the destination image in X dimension - * (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension - * (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the - * destination image - */ -__kernel void cast_qasymm_out(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) -{ - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) - in_data 
= VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr); - VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET); - VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE); - - VEC_DATA_TYPE(float, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(float, VEC_SIZE)) / scale; - VEC_DATA_TYPE(float, VEC_SIZE) out_data = tmp + CONVERT(offset, VEC_DATA_TYPE(float, VEC_SIZE)); - - VSTORE(VEC_SIZE) - (CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, - (__global DATA_TYPE_OUT *)output.ptr); -} -#endif // defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl deleted file mode 100644 index 0285c955b..000000000 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016, 2017 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT) -/** Perform space to depth rearrangement of tensor - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. - * e.g. -DDEPTH_OUT=16 - * @attention The value of the z-axis of output tensor should be given as a preprocessor argument - * using -DZ_OUT=size. e.g. -DZ_OUT=16 - * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. - * -DBLOCK_SIZE=1 - * - * @param[in] input_ptr Pointer to the source image. 
Supported data - * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in - * bytes) - * @param[in] input_step_x input_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in - * bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source - * image - * @param[out] output_ptr Pointer to the destination image. Supported data - * types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension - * (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension - * (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in - * bytes) - * @param[in] output_step_w output_stride_w * number of elements along W - * processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the - * destination image - */ -__kernel void depth_to_space_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); - Tensor4D out = 
CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT); - - int out_index[4] = {0}; - int in_index[4] = {0}; - - out_index[0] = get_global_id(0); // W - out_index[1] = get_global_id(1); // H - out_index[2] = get_global_id(2) % Z_OUT; // C - out_index[3] = get_global_id(2) / Z_OUT; // B - - in_index[0] = out_index[0] / BLOCK_SIZE; - in_index[1] = out_index[1] / BLOCK_SIZE; - in_index[2] = out_index[2] + - ((out_index[1] % BLOCK_SIZE) * BLOCK_SIZE + out_index[0] % BLOCK_SIZE) * DEPTH_OUT; - in_index[3] = out_index[3]; - - *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset( - &in, in_index[0], in_index[1], in_index[2], in_index[3])); -} -#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT) - -#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT) -/** Perform space to depth rearrangement of tensor (NHWC) - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. - * e.g. -DDEPTH_OUT=16 - * @attention The value of the z-axis of output tensor should be given as a preprocessor argument - * using -DZ_OUT=size. e.g. -DZ_OUT=16 - * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. - * -DBLOCK_SIZE=1 - * - * @param[in] input_ptr Pointer to the source image. 
Supported data - * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in - * bytes) - * @param[in] input_step_x input_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in - * bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source - * image - * @param[out] output_ptr Pointer to the destination image. Supported data - * types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension - * (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension - * (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in - * bytes) - * @param[in] output_step_w output_stride_w * number of elements along W - * processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the - * destination image - */ -__kernel void depth_to_space_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); - Tensor4D out = 
CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT); - - int out_index[4] = {0}; - int in_index[4] = {0}; - - out_index[0] = get_global_id(0); // C - out_index[1] = get_global_id(1); // W - out_index[2] = get_global_id(2) % Z_OUT; // H - out_index[3] = get_global_id(2) / Z_OUT; // B - - in_index[0] = out_index[0] + - ((out_index[2] % BLOCK_SIZE) * BLOCK_SIZE + out_index[1] % BLOCK_SIZE) * DEPTH_OUT; - in_index[1] = out_index[1] / BLOCK_SIZE; - in_index[2] = out_index[2] / BLOCK_SIZE; - in_index[3] = out_index[3]; - - *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset( - &in, in_index[0], in_index[1], in_index[2], in_index[3])); -} -#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h index 2d0b6a299..e07a25ec9 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h @@ -15,7 +15,7 @@ */ /* - * Copyright (c) 2016-2018 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -37,7 +37,6 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ - #ifndef ARM_COMPUTE_HELPER_H #define ARM_COMPUTE_HELPER_H @@ -59,16 +58,219 @@ #pragma OPENCL EXTENSION cl_arm_printf : enable #endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) +#define GPU_ARCH_MIDGARD 0x100 +#define GPU_ARCH_BIFROST 0x200 + +/** Concatenate two inputs. + * + * @param[in] a The first input to be concatenated + * @param[in] b The second input to be concatenated + * + * @return The concatenated output + */ +#define CONCAT(a, b) a##b + +/** Expand the given vector + * + * @param[in] x The vector to be expanded + * + * @return The expanded output + */ #define EXPAND(x) x +/** Clamp the given value between an upper and lower bound. 
+ * + * @param[in] x The value to be clamped + * @param[in] min_val The lower bound + * @param[in] max_val The upper bound + * + * @return The clamped value. + */ #define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) +/** REVn reverses the given vector whose size is n. + * @name REVn + * + * @param[in] x The vector to be reversed + * + * @return The reversed vector + * @{ + */ +#define REV1(x) ((x)) +#define REV2(x) ((x).s10) +#define REV3(x) ((x).s210) +#define REV4(x) ((x).s3210) +#define REV8(x) ((x).s76543210) +#define REV16(x) ((x).sFEDCBA9876543210) +/** @} */ // end of group REVn + +/** Reverse the given vector. + * @name REVERSE + * + * @param[in] x The vector to be reversed + * @param[in] s The size of the vector + * + * @return The reversed vector + * @{ + */ +#define REVERSE_STR(x, s) REV##s((x)) +#define REVERSE(x, s) REVERSE_STR(x, s) +/** @} */ // end of group REVERSE + +/** Circular-right-shift (rotate-right) the vector of size s by the amount of n. + * @name ROTs_n + * + * @param[in] x The vector to be shifted + * + * @return The shifted vector + * @{ + */ +#define ROT1_0(x) ((x)) + +#define ROT2_0(x) ((x)) +#define ROT2_1(x) ((x).s10) + +#define ROT3_0(x) ((x)) +#define ROT3_1(x) ((x).s201) +#define ROT3_2(x) ((x).s120) + +#define ROT4_0(x) ((x)) +#define ROT4_1(x) ((x).s3012) +#define ROT4_2(x) ((x).s2301) +#define ROT4_3(x) ((x).s1230) + +#define ROT8_0(x) ((x)) +#define ROT8_1(x) ((x).s70123456) +#define ROT8_2(x) ((x).s67012345) +#define ROT8_3(x) ((x).s56701234) +#define ROT8_4(x) ((x).s45670123) +#define ROT8_5(x) ((x).s34567012) +#define ROT8_6(x) ((x).s23456701) +#define ROT8_7(x) ((x).s12345670) + +#define ROT16_0(x) ((x)) +#define ROT16_1(x) ((x).sF0123456789ABCDE) +#define ROT16_2(x) ((x).sEF0123456789ABCD) +#define ROT16_3(x) ((x).sDEF0123456789ABC) +#define ROT16_4(x) ((x).sCDEF0123456789AB) +#define ROT16_5(x) ((x).sBCDEF0123456789A) +#define ROT16_6(x) ((x).sABCDEF0123456789) +#define ROT16_7(x) ((x).s9ABCDEF012345678) 
+#define ROT16_8(x) ((x).s89ABCDEF01234567) +#define ROT16_9(x) ((x).s789ABCDEF0123456) +#define ROT16_10(x) ((x).s6789ABCDEF012345) +#define ROT16_11(x) ((x).s56789ABCDEF01234) +#define ROT16_12(x) ((x).s456789ABCDEF0123) +#define ROT16_13(x) ((x).s3456789ABCDEF012) +#define ROT16_14(x) ((x).s23456789ABCDEF01) +#define ROT16_15(x) ((x).s123456789ABCDEF0) +/** @} */ // end of group ROTs_n + +/** Circular-right-shift (rotate-right) the given vector by the given amount. + * @name ROTATE + * + * @param[in] x The vector to be shifted + * @param[in] s The size of the vector + * @param[in] n The amount to be shifted + * + * @return The shifted vector + * @{ + */ +#define ROTATE_STR(x, s, n) ROT##s##_##n(x) +#define ROTATE(x, s, n) ROTATE_STR(x, s, n) +/** @} */ // end of group ROTATE + +/** Creates a vector of size n filled with offset values corresponding to the location of each + * element. + * @name V_OFFSn + * + * @param[in] dt The data type of the output vector + * + * @return The vector filled with offset values + * @{ + */ +#define V_OFFS1(dt) (dt)(0) +#define V_OFFS2(dt) (dt)(0, 1) +#define V_OFFS3(dt) (dt)(0, 1, 2) +#define V_OFFS4(dt) (dt)(0, 1, 2, 3) +#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7) +#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) +/** @} */ // end of group V_OFFSn + +/** Create a vector filled with offset values corresponding to the location of each element. 
+ * @name VEC_OFFS + * + * @param[in] dt The data type of the output vector + * @param[in] s The size of the output vector + * + * @return The vector filled with offset values + * @{ + */ +#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt) +#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) +/** @} */ // end of group VEC_OFFS + #define VLOAD_STR(size) vload##size #define VLOAD(size) VLOAD_STR(size) #define VSTORE_STR(size) vstore##size #define VSTORE(size) VSTORE_STR(size) +#define float1 float +#define half1 half +#define char1 char +#define uchar1 uchar +#define short1 short +#define ushort1 ushort +#define int1 int +#define uint1 uint +#define long1 long +#define ulong1 ulong +#define double1 double + +#define vload1(OFFSET, PTR) *(OFFSET + PTR) +#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA + +// Convert built-in functions with _sat modifier are not supported in floating point so we create +// defines +// without _sat to overcome this issue +#define convert_float_sat convert_float +#define convert_float1_sat convert_float +#define convert_float2_sat convert_float2 +#define convert_float3_sat convert_float3 +#define convert_float4_sat convert_float4 +#define convert_float8_sat convert_float8 +#define convert_float16_sat convert_float16 +#define convert_half_sat convert_half +#define convert_half1_sat convert_half +#define convert_half2_sat convert_half2 +#define convert_half3_sat convert_half3 +#define convert_half4_sat convert_half4 +#define convert_half8_sat convert_half8 +#define convert_half16_sat convert_half16 + +#define convert_float1 convert_float +#define convert_half1 convert_half +#define convert_char1 convert_char +#define convert_uchar1 convert_uchar +#define convert_short1 convert_short +#define convert_ushort1 convert_ushort +#define convert_int1 convert_int +#define convert_uint1 convert_uint +#define convert_long1 convert_long +#define convert_ulong1 convert_ulong +#define convert_double1 convert_double + +#define convert_char1_sat convert_char_sat 
+#define convert_uchar1_sat convert_uchar_sat +#define convert_short1_sat convert_short_sat +#define convert_ushort1_sat convert_ushort_sat +#define convert_int1_sat convert_int_sat +#define convert_uint1_sat convert_uint_sat +#define convert_long1_sat convert_long_sat +#define convert_ulong1_sat convert_ulong_sat +#define convert_double1_sat convert_double + #define VEC_DATA_TYPE_STR(type, size) type##size #define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h index a83b1a8a5..5f1b3f902 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h @@ -15,7 +15,7 @@ */ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -37,29 +37,112 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ - #ifndef ARM_COMPUTE_HELPERS_ASYMM_H #define ARM_COMPUTE_HELPERS_ASYMM_H #include "helpers.h" +/** Convert the given vector with round to nearest even rounding mode + * + * @param[in] x The target to be converted + * @param[in] type The target type + * + * @return The converted vector + */ +#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x))) +#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type) + +/** Quantize a floating-point scalar value to 8-bit asymmetric + * + * @param[in] input Input value to quantize + * @param[in] offset Quantization offset + * @param[in] scale Quantization scale + * + * @return quantized value + */ +inline uchar quantize_qasymm8(float input, float offset, float scale) +{ + float out_f32 = input / scale + offset; + uchar res_u8 = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, int), uchar); + return res_u8; +} + +/** Dequantize a scalar value from 8-bit asymmetric to floating-point + * + * @param[in] input Input value to dequantize + * @param[in] offset Quantization offset + * @param[in] scale Quantization scale + * + * @return dequantized value + */ +inline float dequantize_qasymm8(uchar input, float offset, float scale) +{ + return ((float)input - offset) * scale; +} + +/** Dequantize a scalar value from signed 8-bit asymmetric to floating-point + * + * @param[in] input Input value to dequantize + * @param[in] offset Quantization offset + * @param[in] scale Quantization scale + * + * @return dequantized value + */ +inline float dequantize_qasymm8_signed(char input, float offset, float scale) +{ + return ((float)input - offset) * scale; +} + +/** Quantize a vector of values from floating-point + * + * @param[in] type Output data type. + * @param[in] size Size of vector. 
+ * + * @return quantized values + */ +#define QUANTIZE_IMPL(type, size) \ + inline VEC_DATA_TYPE(type, size) \ + quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \ + { \ + VEC_DATA_TYPE(float, size) \ + out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \ + VEC_DATA_TYPE(type, size) \ + res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), \ + VEC_DATA_TYPE(type, size)); \ + return res; \ + } + +/** Dequantize a vector of values to floating-point + * + * @param[in] type Input data type. + * @param[in] size Size of vector. + * + * @return dequantized values in floating point + */ +#define DEQUANTIZE_IMPL(type, size) \ + inline VEC_DATA_TYPE(float, size) \ + dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \ + { \ + return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \ + } + /** Correctly-rounded-to-nearest division by a power-of-two. * * @param[in] size Size of vector. * * @return Correctly-rounded-to-nearest division by a power-of-two. 
*/ -#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, int exponent) \ - { \ - VEC_DATA_TYPE(int, size) \ - mask = (1 << exponent) - 1; \ - const VEC_DATA_TYPE(int, size) zero = 0; \ - const VEC_DATA_TYPE(int, size) one = 1; \ - VEC_DATA_TYPE(int, size) \ - threshold = (mask >> 1) + select(zero, one, x < 0); \ - return (x >> exponent) + select(zero, one, (x & mask) > threshold); \ +#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size( \ + VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \ + { \ + const VEC_DATA_TYPE(int, size) zero = (VEC_DATA_TYPE(int, size))0; \ + const VEC_DATA_TYPE(int, size) one = (VEC_DATA_TYPE(int, size))1; \ + VEC_DATA_TYPE(int, size) \ + mask = (one << exponent) - one; \ + VEC_DATA_TYPE(int, size) \ + threshold = (mask >> 1) + select(zero, one, x < 0); \ + return (x >> exponent) + select(zero, one, (x & mask) > threshold); \ } /** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1), @@ -81,9 +164,19 @@ b_64 = convert_long##size(b); \ VEC_DATA_TYPE(long, size) \ ab_64 = a_64 * b_64; \ - /* COMPMID-907 */ \ + /* Revert COMPMID-907 */ \ + VEC_DATA_TYPE(long, size) \ + mask1 = 1 << 30; \ + VEC_DATA_TYPE(long, size) \ + mask2 = 1 - (1 << 30); \ + VEC_DATA_TYPE(long, size) \ + is_positive_or_zero = ab_64 >= 0; \ + VEC_DATA_TYPE(long, size) \ + nudge = select(mask2, mask1, is_positive_or_zero); \ + VEC_DATA_TYPE(long, size) \ + mask = 1ll << 31; \ VEC_DATA_TYPE(int, size) \ - ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31)); \ + ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask); \ return select(ab_x2_high32, INT_MAX, overflow); \ } @@ -335,9 +428,18 @@ return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \ } +#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, 
offset, scale) +#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size) +#define DEQUANTIZE_STR(input, offset, scale, type, size) \ + dequantize_##type##size(input, offset, scale) +#define DEQUANTIZE(input, offset, scale, type, size) \ + DEQUANTIZE_STR(input, offset, scale, type, size) + #define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) \ asymm_rounding_divide_by_POW2_##size(x, exponent) #define ASYMM_MULT(a, b, size) asymm_mult##size(a, b) +#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \ + ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size) #define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \ ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size) #define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) \ @@ -360,11 +462,53 @@ #define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \ asymm_rescale##size(value, src_integer_bits, dst_integer_bits) +#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \ + { \ + const int left_shift = shift > 0 ? shift : 0; \ + const int right_shift = shift > 0 ? 
0 : -shift; \ + return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), \ + right_shift, size); \ + } +#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) \ + multiply_by_quantized_multiplier##size(input, qmul, shift) + +QUANTIZE_IMPL(uchar, 1) +QUANTIZE_IMPL(char, 1) +QUANTIZE_IMPL(uint, 1) +QUANTIZE_IMPL(int, 1) +QUANTIZE_IMPL(uchar, 4) +QUANTIZE_IMPL(ushort, 4) +QUANTIZE_IMPL(short, 4) +QUANTIZE_IMPL(uchar, 16) +QUANTIZE_IMPL(char, 16) +QUANTIZE_IMPL(ushort, 16) +QUANTIZE_IMPL(short, 16) +QUANTIZE_IMPL(uint, 16) +QUANTIZE_IMPL(int, 16) + +DEQUANTIZE_IMPL(uchar, 1) +DEQUANTIZE_IMPL(char, 1) +DEQUANTIZE_IMPL(uint, 1) +DEQUANTIZE_IMPL(int, 1) +DEQUANTIZE_IMPL(uchar, 4) +DEQUANTIZE_IMPL(ushort, 4) +DEQUANTIZE_IMPL(short, 4) +DEQUANTIZE_IMPL(uchar, 16) +DEQUANTIZE_IMPL(char, 16) +DEQUANTIZE_IMPL(ushort, 16) +DEQUANTIZE_IMPL(short, 16) +DEQUANTIZE_IMPL(uint, 16) +DEQUANTIZE_IMPL(int, 16) + +ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1) ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2) ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4) ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8) ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16) +ASYMM_MULT_IMPL(1) ASYMM_MULT_IMPL(2) ASYMM_MULT_IMPL(4) ASYMM_MULT_IMPL(8) @@ -375,16 +519,19 @@ ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4) ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8) ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16) +ASYMM_SELECT_USING_MASK_IMPL(1) ASYMM_SELECT_USING_MASK_IMPL(2) ASYMM_SELECT_USING_MASK_IMPL(4) ASYMM_SELECT_USING_MASK_IMPL(8) ASYMM_SELECT_USING_MASK_IMPL(16) +ASYMM_MASK_IF_ZERO_IMPL(1) ASYMM_MASK_IF_ZERO_IMPL(2) ASYMM_MASK_IF_ZERO_IMPL(4) ASYMM_MASK_IF_ZERO_IMPL(8) ASYMM_MASK_IF_ZERO_IMPL(16) +ASYMM_MASK_IF_NON_ZERO_IMPL(1) ASYMM_MASK_IF_NON_ZERO_IMPL(2) ASYMM_MASK_IF_NON_ZERO_IMPL(4) ASYMM_MASK_IF_NON_ZERO_IMPL(8) @@ -400,6 +547,7 @@ ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4) ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8) 
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16) +ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(1) ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2) ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4) ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8) @@ -415,9 +563,16 @@ ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16) +ASYMM_RESCALE_IMPL(1) ASYMM_RESCALE_IMPL(2) ASYMM_RESCALE_IMPL(4) ASYMM_RESCALE_IMPL(8) ASYMM_RESCALE_IMPL(16) +MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(1) +MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(2) +MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(4) +MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(8) +MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(16) + #endif // ARM_COMPUTE_HELPERS_ASYMM_H diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl deleted file mode 100644 index 12c8eeb79..000000000 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "helpers.h" - -#ifndef VEC_SIZE -#define VEC_SIZE 1 -#endif - -#if defined(DATA_TYPE) -/** Returns result of prelu function implemented as below: - * f(input) = alpha * input for input < 0, f(input) = input for input >= 0. - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. - * -DVEC_SIZE=16 - * @note Can only take floating point data types. - * - * @param[in] input1_ptr Pointer to the source image. 
Supported Data - * types : F16/F32 - * @param[in] input1_stride_x Stride of the source image in X dimension (in - * bytes) - * @param[in] input1_step_x input1_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] input1_stride_y Stride of the source image in Y dimension (in - * bytes) - * @param[in] input1_step_y input1_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] input1_step_z input1_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source - * image - * @param[in] alpha_ptr Pointer to the source image. Supported Data - * types : F16/F32 - * @param[in] alpha_stride_x Stride of the source image in X dimension (in - * bytes) - * @param[in] alpha_step_x input2_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] alpha_stride_y Stride of the source image in Y dimension (in - * bytes) - * @param[in] alpha_step_y input2_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] alpha_step_z input2_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source - * image - * - * @param[out] output_ptr Pointer to the destination image. 
Supported - * data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension - * (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension - * (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the - * destination image - */ -__kernel void prelu(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(alpha), - TENSOR3D_DECLARATION(output)) -{ - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VSTORE(VEC_SIZE) - (VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) < 0 - ? VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) * - VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)alpha.ptr) - : VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), - 0, (__global DATA_TYPE *)output.ptr); -} -#endif // defined(DATA_TYPE) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl deleted file mode 100644 index a66e107d1..000000000 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "helpers.h" -#define SUB(x, y) (x) - (y) - -#if defined(OFF_IN) && defined(OFF_ALPHA) && defined(OFF_OUT) && defined(SCALE_IN) && \ - defined(SCALE_ALPHA) && defined(SCALE_OUT) && defined(VEC_SIZE) - -#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE) -#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) -#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE) -#define CONVERT_RTE(x, type) (convert_##type##_rte((x))) -#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type) -#define SELECT_TYPE VEC_INT - -/** Returns result of prelu function implemented as below: - * f(input) = alpha * input for input < 0, f(input) = input for input >= 0. - * - * @attention Data type can be passed using the -DDATA_TYPE_IN compile flag, e.g. - * -DDATA_TYPE_IN=uchar - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. - * -DVEC_SIZE=16 - * @note Can only take uchar data types. - * - * @param[in] input1_ptr Pointer to the source image. Supported Data - * types : QASYMM8 - * @param[in] input1_stride_x Stride of the source image in X dimension (in - * bytes) - * @param[in] input1_step_x input1_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] input1_stride_y Stride of the source image in Y dimension (in - * bytes) - * @param[in] input1_step_y input1_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] input1_step_z input1_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source - * image - * @param[in] alpha_ptr Pointer to the source image. 
Supported Data - * types : QASYMM8 - * @param[in] alpha_stride_x Stride of the source image in X dimension (in - * bytes) - * @param[in] alpha_step_x input2_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] alpha_stride_y Stride of the source image in Y dimension (in - * bytes) - * @param[in] alpha_step_y input2_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] alpha_step_z input2_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source - * image - * @param[out] output_ptr Pointer to the destination image. Supported - * data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension - * (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension - * (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the - * destination image - */ -__kernel void prelu_qasymm8(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(alpha), - TENSOR3D_DECLARATION(output)) -{ - // Get pixels pointer - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VEC_INT in_vec = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)input.ptr), VEC_INT); - VEC_INT alpha_vec = 
CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)alpha.ptr), VEC_INT); - - in_vec = SUB(in_vec, (VEC_INT)((int)OFF_IN)); - alpha_vec = SUB(alpha_vec, (VEC_INT)((int)OFF_ALPHA)); - - const VEC_FLOAT inf32 = CONVERT(in_vec, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN); - const VEC_FLOAT alphaf32 = CONVERT(alpha_vec, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_ALPHA); - const VEC_FLOAT outf32 = - select(inf32, inf32 * alphaf32, CONVERT(inf32 < (VEC_FLOAT)0, SELECT_TYPE)); - const VEC_FLOAT qresf32 = outf32 / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFF_OUT)); - const VEC_UCHAR res = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_UCHAR); - - VSTORE(VEC_SIZE) - (res, 0, (__global uchar *)output.ptr); -} - -#endif // defined(OFF_IN) && defined(OFF_ALPHA) && defined(OFF_OUT) && defined(SCALE_IN) && - // defined(SCALE_ALPHA) && defined(SCALE_OUT) && defined(VEC_SIZE) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl deleted file mode 100644 index eb612f834..000000000 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016, 2017 ARM Limited. 
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/

#include "helpers.h"

#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN)
/** Perform space to depth rearrangement of a tensor indexed as (W, H, C, B)
 *
 * Every work-item copies exactly one input element to the output position
 *   x_out = x_in / BLOCK_SIZE
 *   y_out = y_in / BLOCK_SIZE
 *   z_out = z_in + ((y_in % BLOCK_SIZE) * BLOCK_SIZE + x_in % BLOCK_SIZE) * DEPTH_IN
 *   w_out = w_in
 *
 * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
 * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size.
 *            e.g. -DDEPTH_IN=16
 * @attention The size of the Z dimension of the input tensor should be given as a preprocessor
 *            argument using -DZ_IN=size. e.g. -DZ_IN=16. It is used to split get_global_id(2)
 *            into the Z index and the batch index.
 * @attention Block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g.
 *            -DBLOCK_SIZE=1
 *
 * @param[in]  input_ptr                            Pointer to the source image. Supported data
 *                                                  types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
 * @param[in]  input_stride_x                       Stride of the source image in X dimension (in
 *                                                  bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X
 *                                                  processed per workitem(in bytes)
 * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in
 *                                                  bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y
 *                                                  processed per workitem(in bytes)
 * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in
 *                                                  bytes)
 * @param[in]  input_step_z                         input_stride_z * number of elements along Z
 *                                                  processed per workitem(in bytes)
 * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in
 *                                                  bytes)
 * @param[in]  input_step_w                         input_stride_w * number of elements along W
 *                                                  processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source
 *                                                  image
 * @param[out] output_ptr                           Pointer to the destination image. Supported data
 *                                                  types: same as @p input_ptr
 * @param[in]  output_stride_x                      Stride of the destination image in X dimension
 *                                                  (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X
 *                                                  processed per workitem(in bytes)
 * @param[in]  output_stride_y                      Stride of the destination image in Y dimension
 *                                                  (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y
 *                                                  processed per workitem(in bytes)
 * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension
 *                                                  (in bytes)
 * @param[in]  output_step_z                        output_stride_z * number of elements along Z
 *                                                  processed per workitem(in bytes)
 * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension
 *                                                  (in bytes)
 * @param[in]  output_step_w                        output_stride_w * number of elements along W
 *                                                  processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the
 *                                                  destination image
 */
__kernel void space_to_depth_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
{
  Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN);
  // The destination element is addressed explicitly through tensor4D_offset() below.
  Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);

  // Coordinates of the input element handled by this work-item.
  const int in_w = get_global_id(0);        // W
  const int in_h = get_global_id(1);        // H
  const int in_c = get_global_id(2) % Z_IN; // C
  const int in_b = get_global_id(2) / Z_IN; // B

  // The position inside the BLOCK_SIZE x BLOCK_SIZE spatial block selects the channel group.
  const int out_w = in_w / BLOCK_SIZE;
  const int out_h = in_h / BLOCK_SIZE;
  const int out_c = in_c + ((in_h % BLOCK_SIZE) * BLOCK_SIZE + in_w % BLOCK_SIZE) * DEPTH_IN;

  *((__global DATA_TYPE *)tensor4D_offset(&out, out_w, out_h, out_c, in_b)) =
    *((__global DATA_TYPE *)in.ptr);
}
#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN)

#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN)
/** Perform space to depth rearrangement of a tensor indexed as (C, W, H, B)
 *
 * Every work-item copies exactly one input element to the output position
 *   x_out = x_in + ((z_in % BLOCK_SIZE) * BLOCK_SIZE + y_in % BLOCK_SIZE) * DEPTH_IN
 *   y_out = y_in / BLOCK_SIZE
 *   z_out = z_in / BLOCK_SIZE
 *   w_out = w_in
 *
 * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
 * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size.
 *            e.g. -DDEPTH_IN=16
 * @attention The size of the Z dimension of the input tensor should be given as a preprocessor
 *            argument using -DZ_IN=size. e.g. -DZ_IN=16. It is used to split get_global_id(2)
 *            into the Z index and the batch index.
 * @attention Block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g.
 *            -DBLOCK_SIZE=1
 *
 * @param[in]  input_ptr                            Pointer to the source image. Supported data
 *                                                  types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
 * @param[in]  input_stride_x                       Stride of the source image in X dimension (in
 *                                                  bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X
 *                                                  processed per workitem(in bytes)
 * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in
 *                                                  bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y
 *                                                  processed per workitem(in bytes)
 * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in
 *                                                  bytes)
 * @param[in]  input_step_z                         input_stride_z * number of elements along Z
 *                                                  processed per workitem(in bytes)
 * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in
 *                                                  bytes)
 * @param[in]  input_step_w                         input_stride_w * number of elements along W
 *                                                  processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source
 *                                                  image
 * @param[out] output_ptr                           Pointer to the destination image. Supported data
 *                                                  types: same as @p input_ptr
 * @param[in]  output_stride_x                      Stride of the destination image in X dimension
 *                                                  (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X
 *                                                  processed per workitem(in bytes)
 * @param[in]  output_stride_y                      Stride of the destination image in Y dimension
 *                                                  (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y
 *                                                  processed per workitem(in bytes)
 * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension
 *                                                  (in bytes)
 * @param[in]  output_step_z                        output_stride_z * number of elements along Z
 *                                                  processed per workitem(in bytes)
 * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension
 *                                                  (in bytes)
 * @param[in]  output_step_w                        output_stride_w * number of elements along W
 *                                                  processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the
 *                                                  destination image
 */
__kernel void space_to_depth_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
{
  Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN);
  // The destination element is addressed explicitly through tensor4D_offset() below.
  Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);

  // Coordinates of the input element handled by this work-item.
  const int in_c = get_global_id(0);        // C
  const int in_w = get_global_id(1);        // W
  const int in_h = get_global_id(2) % Z_IN; // H
  const int in_b = get_global_id(2) / Z_IN; // B

  // The position inside the BLOCK_SIZE x BLOCK_SIZE spatial block selects the channel group.
  const int out_c = in_c + ((in_h % BLOCK_SIZE) * BLOCK_SIZE + in_w % BLOCK_SIZE) * DEPTH_IN;
  const int out_w = in_w / BLOCK_SIZE;
  const int out_h = in_h / BLOCK_SIZE;

  *((__global DATA_TYPE *)tensor4D_offset(&out, out_c, out_w, out_h, in_b)) =
    *((__global DATA_TYPE *)in.ptr);
}
#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN)