diff options
Diffstat (limited to 'libs/ARMComputeEx/src/core/CL/cl_kernels')
35 files changed, 1902 insertions, 999 deletions
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl new file mode 100644 index 000000000..f54c7bde3 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + +#define CONST_ONE 1.f +#define DIV_OP(a, b) ((a) / (b)) +#define RSQRT_OP(a) DIV_OP(CONST_ONE, sqrt((a))) + +// Inverse Square-root Activation +inline TYPE rsqrt_op(TYPE x) +{ + return RSQRT_OP(x); +} + +#define ACTIVATION_OP2(op, x) op##_op(x) +#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x) + +#if defined(ACT) + +/** This performs an activation function floating point inputs. + * + * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH + * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively. + * + * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void activation_layer_ex( + TENSOR3D_DECLARATION(input) +#ifndef IN_PLACE + , + TENSOR3D_DECLARATION(output) +#endif /* not IN_PLACE */ +) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); +#ifdef IN_PLACE + Tensor3D output = input; +#else /* IN_PLACE */ + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); +#endif /* IN_PLACE */ + + // Load data + TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr); + + // Perform activation + data = ACTIVATION_OP(ACT, data); + + // Store result + VSTORE(VEC_SIZE) + (data, 0, (__global DATA_TYPE *)output.ptr); +} + +#endif /* defined(ACT) */ diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl new file mode 100644 index 000000000..9a6921d7c --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) +/** Perform arg_max/arg_min + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 + * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using + * -DOP_CODE = number. e.g. -DOP_CODE=1 + * + * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[out] output_ptr Pointer to the destination image. Supported data types: U32 + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] axis Axis through which reduction occurs for max value index + * @param[in] dim Dimension across the axis to be reduced. + */ + +__kernel void arg_op(TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output), + const int axis, + const int dim) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + + int indices[4] = + { + get_global_id(0), + get_global_id(1), + get_global_id(2) % DEPTH_OUT, + get_global_id(2) / DEPTH_OUT, + }; + + DATA_TYPE value = *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); + DATA_TYPE tval = value; + int idx = 0; + for(int i = 1; i < dim; ++i) + { + indices[axis] = i; + + #if OP_CODE == 1 // ArgMax + value = max(value, *((__global DATA_TYPE *) + tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]))); + #elif OP_CODE == 2 //ArgMin + value = min(value, *((__global DATA_TYPE *) + tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]))); + #else + return; + + #endif + + if(tval!=value) + { + idx = indices[axis]; + tval = value; + } + } + + *((__global uint *)out.ptr) = idx; +} +#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_ex.cl new file mode 100644 index 000000000..2ed698951 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_ex.cl @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#ifdef SATURATE +#define SUB(x, y) sub_sat((x), (y)) +#else /* SATURATE */ +#define SUB(x, y) (x) - (y) +#endif /* SATURATE */ + +/** This function subtracts one tensors from another. + * + * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: + * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short + * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used. + * + * @param[in] in1_ptr Pointer to the source tensor. Supported data types: U8, S16 + * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] in2_ptr Pointer to the source tensor. Supported data types: U8, S16 + * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8, S16 + * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void arithmetic_sub_ex( + TENSOR3D_DECLARATION(in1), + TENSOR3D_DECLARATION(in2), + TENSOR3D_DECLARATION(out)) +{ + // Get pixels pointer + Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); + Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); + + // Load values + VEC_DATA_TYPE(DATA_TYPE_OUT, 16) + in_a = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16)); + VEC_DATA_TYPE(DATA_TYPE_OUT, 16) + in_b = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16)); + + // Calculate and store result + vstore16(SUB(in_a, in_b), 0, (__global DATA_TYPE_OUT *)out.ptr); +} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl index 0c0a9ede6..5cd0a4309 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl @@ -2,32 +2,20 @@ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved * Copyright (c) 2016, 2017 ARM Limited. * - * SPDX-License-Identifier: MIT + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: + * http://www.apache.org/licenses/LICENSE-2.0 * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ #include "helpers_asymm.h" -#if defined(FIXED_POINT_POSITION) -#include "fixed_point.h" -#endif /* FIXED_POINT_POSITION */ - #ifdef SATURATE #define ADD(x, y) add_sat((x), (y)) #define SUB(x, y) sub_sat((x), (y)) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/batch_to_space_nd.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/batch_to_space_nd.cl new file mode 100644 index 000000000..ad6a48a02 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/batch_to_space_nd.cl @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016, 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE0) && defined(BLOCK_SIZE1) && defined(BATCH_OUT) +/** Perform batch to space rearrangement of tensor + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Output tensor batch should be given as a preprocessor argument using -DBATCH_OUT=size. e.g. -DBATCH_OUT=16 + * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE0=size. e.g. -DBLOCK_SIZE0=1 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p inpu +t_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in +bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void batch_to_space_nd( + TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output)) + { + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + + int out_index[4]={0}; + int in_index[4]={0}; + + out_index[0] = get_global_id(0);//W + out_index[1] = get_global_id(1);//H + out_index[2] = get_global_id(2) % DEPTH_OUT;//C + out_index[3] = get_global_id(2) / DEPTH_OUT;//N + + in_index[0] = out_index[0]/BLOCK_SIZE1; + in_index[1] = out_index[1]/BLOCK_SIZE0; + in_index[2] = out_index[2]; + in_index[3] = out_index[3] + ((out_index[1] % BLOCK_SIZE0) * BLOCK_SIZE0 + out_index[0] % BLOCK_SIZE1) * BATCH_OUT; + + *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3])); + } +#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE0) && defined(BLOCK_SIZE1) && defined(BATCH_OUT) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl new file mode 100644 index 000000000..bea61f53e --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(OP_CODE) && defined(DATA_TYPE) +/** returns truth value of the two input tensors for BINARY LOGICAL OP. + * where BINARY LOGICAL OP can be AND, OR. + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=uchar + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using + * -DOP_CODE = number. e.g. -DOP_CODE=1 + * + * @param[in] input1_ptr Pointer to the source tensor. Supported data types: QASYMM8 + * @param[in] input1_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input1_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source tensor + * + * @param[in] input2_ptr Pointer to the source tensor.Supported data types: QASYMM8 + * @param[in] input2_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input2_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input2_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input2_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input2_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source tensor + * + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: QASYMM8 + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + */ +__kernel void binary_logical_op( + TENSOR3D_DECLARATION(input1), + TENSOR3D_DECLARATION(input2), + TENSOR3D_DECLARATION(output)) +{ + Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1); + Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + #if OP_CODE == 1 // LOGICAL AND + VSTORE(VEC_SIZE) + (CONVERT(VLOAD(VEC_SIZE) + (0, (__global DATA_TYPE *)input1.ptr) && VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr), + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr); + + #elif OP_CODE == 2 // LOGICAL OR + VSTORE(VEC_SIZE) + (CONVERT(VLOAD(VEC_SIZE) + (0, (__global DATA_TYPE *)input1.ptr) || VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr), + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr); + + #else // OP NOT SUPPORTED + return + + #endif +} +#endif //if defined(OP_CODE) && defined(DATA_TYPE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl index 113804cca..3d4675e5d 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl @@ -2,38 +2,34 @@ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved * Copyright (c) 2017 ARM Limited. * - * SPDX-License-Identifier: MIT + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: + * http://www.apache.org/licenses/LICENSE-2.0 * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ #include "helpers.h" -#ifndef SCALE_IN -#define SCALE_IN 1.0f +#ifndef SCALE +#define SCALE 1.0f +#endif +#ifndef OFFSET +#define OFFSET 0 #endif -#ifndef OFFSET_IN -#define OFFSET_IN 0 +#ifndef VEC_SIZE +#define VEC_SIZE 1 #endif +#if defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) /** Perform a cast operation on an input tensor. * - * @attention Data type can be passed using the -DDATA_TYPE_IN compile flag, e.g. -DDATA_TYPE_IN=float + * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 * * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 @@ -65,9 +61,9 @@ __kernel void cast( 0, (__global DATA_TYPE_OUT *)output.ptr); } - /** Perform a cast operation on an QASYMM8 input tensor. - * + * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int + * @attention Offset and Scale of input should be given as a preprocessor argument using -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5 * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 * * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 @@ -96,8 +92,8 @@ __kernel void cast_qasymm_in( VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr); - VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET_IN); - VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE_IN); + VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET); + VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE); VEC_DATA_TYPE(int, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(int, VEC_SIZE)) - offset; VEC_DATA_TYPE(float, VEC_SIZE) out_data = CONVERT(tmp, VEC_DATA_TYPE(float, VEC_SIZE)) * scale; @@ -108,7 +104,8 @@ __kernel void cast_qasymm_in( /** Perform a cast operation on an QASYMM8 output tensor. - * + * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int + * @attention Offset and Scale of output should be given as a preprocessor argument using -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5 * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 * * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 @@ -137,8 +134,8 @@ __kernel void cast_qasymm_out( VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr); - VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET_IN); - VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE_IN); + VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET); + VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE); VEC_DATA_TYPE(float, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(float, VEC_SIZE)) / scale; VEC_DATA_TYPE(float, VEC_SIZE) out_data = tmp + CONVERT(offset, VEC_DATA_TYPE(float, VEC_SIZE)); @@ -146,3 +143,4 @@ __kernel void cast_qasymm_out( VSTORE(VEC_SIZE)(CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)output.ptr); } +#endif // defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op.cl new file mode 100644 index 000000000..765072556 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op.cl @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) && defined(OP_CODE) +/** Returns truth value of comparison operators. + * Comparison operators may be equal, not_equal etc. + * + * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN, -DDATA_TYPE_OUT, + * e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT = uchar + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using + * -DOP_CODE = number. e.g. -DOP_CODE=1 + * + * @param[in] input1_ptr Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] input1_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input1_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source tensor + * + * @param[in] input2_ptr Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] input2_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input2_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input2_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input2_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input2_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source tensor + * + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: QASYMM8 + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void comparison_op( + TENSOR3D_DECLARATION(input1), + TENSOR3D_DECLARATION(input2), + TENSOR3D_DECLARATION(output)) +{ + Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1); + Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + #if OP_CODE == 1 //EQUAL + VSTORE(VEC_SIZE) + (CONVERT(VLOAD(VEC_SIZE) + (0, (__global DATA_TYPE_IN *)input1.ptr) == VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input2.ptr), + VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),0, (__global DATA_TYPE_OUT *)output.ptr); + + #elif OP_CODE == 2 //NOT_EQUAL + VSTORE(VEC_SIZE) + (CONVERT(VLOAD(VEC_SIZE) + (0, (__global DATA_TYPE_IN *)input1.ptr) != VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input2.ptr), + VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)output.ptr); + + #else // OP NOT SUPPORTED + return; + + #endif +} +#endif // defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) && defined(OP_CODE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl new file mode 100644 index 000000000..1eb305f7b --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" +#define SUB(x, y) (x) - (y) + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(DATA_TYPE_OUT) + +#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE) +#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) +#define VEC_OUT VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE) + +/** Returns the truth value of comparison . + * @attention Offset and Scale of both input should be given as a preprocessor argument using -DOFFSET_IN1=int, -DOFFSET_IN2=int, -DSCALE_IN1=float and -DSCALE_IN2=float. e.g. -DOFFSET_IN1=1, -DOFFSET_IN2=0, -DSCALE_IN1=0.5, -DSCALE_IN2=0.5 + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using + * -DOP_CODE = number. e.g. -DOP_CODE=1 + * + * @param[in] input1_ptr Pointer to the source tensor. Supported data types: QASYMM8 + * @param[in] input1_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input1_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source tensor + * + * @param[in] input2_ptr Pointer to the source tensor. Supported data types: QASYMM8 + * @param[in] input2_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input2_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input2_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input2_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input2_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source tensor + * + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: QASYMM8 + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void comparison_op_qasymm8( + TENSOR3D_DECLARATION(in1), + TENSOR3D_DECLARATION(in2), + TENSOR3D_DECLARATION(out)) +{ + // Get pixels pointer + Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); + Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); + + VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)in1.ptr), VEC_INT); + VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)in2.ptr), VEC_INT); + + in_a = SUB(in_a, (VEC_INT)((int)OFFSET_IN1)); + in_b = SUB(in_b, (VEC_INT)((int)OFFSET_IN2)); + + const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1); + const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2); + + #if OPCODE == 1 //EQUAL QUANTIZED + VSTORE(VEC_SIZE)(CONVERT(in1f32 == in2f32, VEC_OUT), 0, (__global DATA_TYPE_OUT *)out.ptr); + + #elif OPCODE == 2 //NOT EQUAL QUANTIZED + VSTORE(VEC_SIZE)(CONVERT(in1f32 != in2f32, VEC_OUT), 0, (__global DATA_TYPE_OUT *)out.ptr); + + #else // OP NOT SUPPORTED + return; + + #endif +} +#endif // defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(DATA_TYPE_OUT) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl new file mode 100644 index 000000000..fef2243e7 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016, 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) +/** Perform space to depth rearrangement of tensor + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16 + * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. -DBLOCK_SIZE=1 + * + * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p inpu +t_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in +bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void depth_to_space( + TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output)) + { + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + + int out_index[4]={0}; + int in_index[4]={0}; + + out_index[0] = get_global_id(0);//W + out_index[1] = get_global_id(1);//H + out_index[2] = get_global_id(2) % DEPTH_OUT;//C + out_index[3] = get_global_id(2) / DEPTH_OUT;//B + + in_index[0] = out_index[0]/BLOCK_SIZE; + in_index[1] = out_index[1]/BLOCK_SIZE; + in_index[2] = out_index[2] + ((out_index[1] % BLOCK_SIZE) * BLOCK_SIZE + out_index[0] % BLOCK_SIZE) * DEPTH_OUT; + in_index[3] = out_index[3]; + + *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2],in_index[3])); + } +#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl new file mode 100644 index 000000000..348458fe9 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS) +/** Perform embedding_lookup of input tensor + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16 + * @attention Number of input dimensions are passed as a preprocessor argument using -DNUM_DIMS=size, e.g. -DNUM_DIMS=4 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] lookups_ptr Pointer to the lookups vector. Supported data types: S32 + * @param[in] lookups_stride_x Stride of the lookups vector in X dimension (in bytes) + * @param[in] lookups_step_x lookups_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector + */ + +__kernel void embedding_lookup(TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output), + VECTOR_DECLARATION(lookups)) +{ + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT); + + Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups); + + //lookup ids for based on the tensor dimensions + int lup_id[4] = {0}; + + lup_id[0] = (NUM_DIMS == 1)?*((__global int *)vector_offset(&lups,get_global_id(0))) + :get_global_id(0); + lup_id[1] = (NUM_DIMS == 2)?*((__global int *)vector_offset(&lups,get_global_id(1))) + :get_global_id(1); + lup_id[2] = (NUM_DIMS == 3)?*((__global int *)vector_offset(&lups,get_global_id(2))) + :get_global_id(2)%DEPTH_OUT; + lup_id[3] = (NUM_DIMS == 4)?*((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) + :get_global_id(2) / DEPTH_OUT; + + in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + lup_id[1] * input_step_y + + lup_id[2] * input_step_z + lup_id[3] * input_step_w; + + VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), + 0, (__global DATA_TYPE *)out.ptr); +} +#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl new file mode 100644 index 000000000..69d94f30a --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(DATA_TYPE) +/** Perform an exponential operation on an input tensor. + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Can only take floating point data types. + * + * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void exp_layer( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output)) +{ + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VSTORE(VEC_SIZE) + (exp(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr)), 0, (__global DATA_TYPE *)output.ptr); +} +#endif // defined(DATA_TYPE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/fixed_point.h b/libs/ARMComputeEx/src/core/CL/cl_kernels/fixed_point.h deleted file mode 100644 index 7807533e2..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/fixed_point.h +++ /dev/null @@ -1,565 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_FIXED_POINT_H -#define ARM_COMPUTE_FIXED_POINT_H - -#define TYPE_ALIAS(type, alias) \ - typedef type alias; \ - typedef type alias##x##1; \ - typedef type##2 alias##x##2; \ - typedef type##3 alias##x##3; \ - typedef type##4 alias##x##4; \ - typedef type##8 alias##x##8; \ - typedef type##16 alias##x##16; - -TYPE_ALIAS(char, qs8) -TYPE_ALIAS(short, qs16) -TYPE_ALIAS(int, qs32) - -#define qs8_MIN ((char)CHAR_MIN) -#define qs8_MAX ((char)CHAR_MAX) -#define qs16_MIN ((short)SHRT_MIN) -#define qs16_MAX ((short)SHRT_MAX) -#define qs32_MIN ((int)INT_MIN) -#define qs32_MAX ((int)INT_MAX) - -#define qu8_MIN ((uchar)0) -#define qu8_MAX ((uchar)UCHAR_MAX) -#define qu16_MIN ((ushort)0) -#define qu16_MAX ((ushort)USHRT_MAX) -#define qu32_MIN ((uint)0) -#define qu32_MAX ((uint)UINT_MAX) - -#define qs8_TYPE char -#define qs8x1_TYPE char -#define qs8x2_TYPE char2 -#define qs8x3_TYPE char3 -#define qs8x4_TYPE char4 -#define qs8x8_TYPE char8 -#define qs8x16_TYPE char16 - -#define qs16_TYPE short -#define qs16x1_TYPE short -#define qs16x2_TYPE short2 -#define qs16x3_TYPE short3 -#define qs16x4_TYPE short4 -#define qs16x8_TYPE short8 -#define qs16x16_TYPE short16 - -#define qs32_TYPE int -#define qs32x1_TYPE int -#define qs32x2_TYPE int2 -#define qs32x3_TYPE int3 -#define qs32x4_TYPE int4 -#define qs32x8_TYPE int8 -#define qs32x16_TYPE int16 - -/* All internal constants are represented in the maximum supported fixed point format (QS16), - * thus we define an additional shift parameter required to convert the constant - * from the maximum supported format to the require one. - */ -#define qs8_SHIFT 8 -#define qs16_SHIFT 0 - -#undef VEC_DATA_TYPE_STR -#undef VEC_DATA_TYPE -#undef CONVERT_STR -#undef CONVERT -#undef CONVERT_SAT_STR -#undef CONVERT_SAT - -#define VEC_DATA_TYPE_STR(type, size) type##x##size -#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) - -#define CONVERT_STR3(x, type, rtype) (convert_##rtype((x))) -#define CONVERT_STR2(x, type, rtype) CONVERT_STR3(x, type, rtype) -#define CONVERT_STR(x, type) CONVERT_STR2(x, type, type##_TYPE) -#define CONVERT(x, type) CONVERT_STR(x, type) - -#define CONVERT_SAT_STR3(x, type, rtype) (convert_##rtype##_sat((x))) -#define CONVERT_SAT_STR2(x, type, rtype) CONVERT_SAT_STR3(x, type, rtype) -#define CONVERT_SAT_STR(x, type) CONVERT_SAT_STR2(x, type, type##_TYPE) -#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) - -/** Computes saturating absolute value of fixed point vector. - * - * @param[in] type the actual data type. - * - * @return The result of the fixed point absolute value. - */ -#define ABSQ_SAT_IMPL(type) \ - inline type abs_##type##_sat(type VopA) { return CONVERT_SAT(abs(VopA), type); } - -ABSQ_SAT_IMPL(qs8x16) -ABSQ_SAT_IMPL(qs16x8) - -#define ABS_SAT_OP_EXPAND_STR(a, type, size) abs_##type##x##size##_sat((a)) -#define ABS_SAT_OP_EXPAND(a, type, size) ABS_SAT_OP_EXPAND_STR(a, type, size) - -/** Computes max of fixed point types. - * - * @param[in] type the actual data type. - * - * @return The result of the fixed point maximum. - */ -#define MAXQ_IMPL(type) \ - inline type max_##type(type VopA, type VopB) { return max(VopA, VopB); } - -MAXQ_IMPL(qs8x1) -MAXQ_IMPL(qs8x2) -MAXQ_IMPL(qs8x4) -MAXQ_IMPL(qs8x8) -MAXQ_IMPL(qs8x16) -MAXQ_IMPL(qs16x1) -MAXQ_IMPL(qs16x2) -MAXQ_IMPL(qs16x4) -MAXQ_IMPL(qs16x8) -MAXQ_IMPL(qs16x16) - -#define MAX_OP_EXPAND_STR(a, b, type, size) max_##type##x##size((a), (b)) -#define MAX_OP_EXPAND(a, b, type, size) MAX_OP_EXPAND_STR(a, b, type, size) - -/** Computes saturated addition of fixed point types. - * - * @param[in] type the actual data type. - * - * @return The result of the fixed point addition. The result is saturated in case of overflow - */ -#define ADDQ_SAT_IMPL(type) \ - inline type add_sat_##type(type VopA, type VopB) { return add_sat(VopA, VopB); } - -ADDQ_SAT_IMPL(qs8x1) -ADDQ_SAT_IMPL(qs8x2) -ADDQ_SAT_IMPL(qs8x4) -ADDQ_SAT_IMPL(qs8x8) -ADDQ_SAT_IMPL(qs8x16) -ADDQ_SAT_IMPL(qs16x1) -ADDQ_SAT_IMPL(qs16x2) -ADDQ_SAT_IMPL(qs16x4) -ADDQ_SAT_IMPL(qs16x8) -ADDQ_SAT_IMPL(qs16x16) -ADDQ_SAT_IMPL(qs32x1) -ADDQ_SAT_IMPL(qs32x2) -ADDQ_SAT_IMPL(qs32x4) -ADDQ_SAT_IMPL(qs32x8) -ADDQ_SAT_IMPL(qs32x16) - -#define ADD_SAT_OP_EXPAND_STR(a, b, type, size) add_sat_##type##x##size((a), (b)) -#define ADD_SAT_OP_EXPAND(a, b, type, size) ADD_SAT_OP_EXPAND_STR(a, b, type, size) - -/** Computes saturated subtraction of fixed point types. - * - * @param[in] type the actual data type. - * - * @return The result of the fixed point subtraction. The result is saturated in case of overflow - */ -#define SUBQ_SAT_IMPL(type) \ - inline type sub_sat_##type(type VopA, type VopB) { return sub_sat(VopA, VopB); } - -SUBQ_SAT_IMPL(qs8x1) -SUBQ_SAT_IMPL(qs8x2) -SUBQ_SAT_IMPL(qs8x4) -SUBQ_SAT_IMPL(qs8x8) -SUBQ_SAT_IMPL(qs8x16) -SUBQ_SAT_IMPL(qs16x1) -SUBQ_SAT_IMPL(qs16x2) -SUBQ_SAT_IMPL(qs16x4) -SUBQ_SAT_IMPL(qs16x8) -SUBQ_SAT_IMPL(qs16x16) - -#define SUB_SAT_OP_EXPAND_STR(a, b, type, size) sub_sat_##type##x##size((a), (b)) -#define SUB_SAT_OP_EXPAND(a, b, type, size) SUB_SAT_OP_EXPAND_STR(a, b, type, size) - -/* Multiply of two fixed point numbers - * - * @param[in] type the actual data type. - * @param[in] itype the intermediate data type. - * - * @return The result of the fixed point multiplication. - */ -#define MULQ_IMPL(type, itype) \ - inline type mul_##type(type VopA, type VopB, int fixed_point_position) \ - { \ - itype round_val = (itype)(1 << (fixed_point_position - 1)); \ - itype res = CONVERT((VopA), itype) * CONVERT((VopB), itype) + round_val; \ - return CONVERT((res >> (itype)fixed_point_position), type); \ - } - -MULQ_IMPL(qs8x8, qs16x8) -MULQ_IMPL(qs16x8, qs32x8) -MULQ_IMPL(qs8x16, qs16x16) -MULQ_IMPL(qs16x16, qs32x16) - -#define MUL_OP_EXPAND_STR(a, b, type, size, position) mul_##type##x##size((a), (b), (position)) -#define MUL_OP_EXPAND(a, b, type, size, position) MUL_OP_EXPAND_STR(a, b, type, size, position) - -/* Saturate multiply of two fixed point numbers - * - * @param[in] type the actual data type. - * @param[in] itype the intermediate data type. - * - * @return The result of the fixed point multiplication. The result is saturated in case of overflow - */ -#define MULQ_SAT_IMPL(type, itype) \ - inline type mul_sat_##type(type VopA, type VopB, int fixed_point_position) \ - { \ - itype round_val = (itype)(1 << (fixed_point_position - 1)); \ - itype res = mad_sat(CONVERT((VopA), itype), CONVERT((VopB), itype), round_val); \ - return CONVERT_SAT((res >> (itype)fixed_point_position), type); \ - } - -MULQ_SAT_IMPL(qs8x1, qs16x1) -MULQ_SAT_IMPL(qs8x2, qs16x2) -MULQ_SAT_IMPL(qs8x3, qs16x3) -MULQ_SAT_IMPL(qs8x4, qs16x4) -MULQ_SAT_IMPL(qs8x8, qs16x8) -MULQ_SAT_IMPL(qs8x16, qs16x16) -MULQ_SAT_IMPL(qs16x1, qs32x1) -MULQ_SAT_IMPL(qs16x2, qs32x2) -MULQ_SAT_IMPL(qs16x3, qs32x3) -MULQ_SAT_IMPL(qs16x4, qs32x4) -MULQ_SAT_IMPL(qs16x8, qs32x8) -MULQ_SAT_IMPL(qs16x16, qs32x16) - -#define MUL_SAT_OP_EXPAND_STR(a, b, type, size, position) \ - mul_sat_##type##x##size((a), (b), (position)) -#define MUL_SAT_OP_EXPAND(a, b, type, size, position) \ - MUL_SAT_OP_EXPAND_STR(a, b, type, size, position) - -/** Saturate multiply-accumulate - * - * @param[in] type the actual data type. - * @param[in] itype the intermediate data type. - * - * @return The result of the fixed point multiply-accumulate. The result is saturated in case of - * overflow - */ -#define MLAQ_SAT_IMPL(type, itype) \ - type mla_sat_##type(type VopA, type VopB, type VopC, int fixed_point_position) \ - { \ - itype res = mad_sat(CONVERT(VopB, itype), CONVERT(VopC, itype), \ - (itype)(1 << (fixed_point_position - 1))); \ - return add_sat(VopA, CONVERT_SAT(res >> (itype)fixed_point_position, type)); \ - } - -MLAQ_SAT_IMPL(qs8x8, qs16x8) -MLAQ_SAT_IMPL(qs8x16, qs16x16) -MLAQ_SAT_IMPL(qs16x8, qs32x8) - -#define MLA_SAT_OP_EXPAND_STR(a, b, c, type, size, position) \ - mla_sat_##type##x##size((a), (b), (c), (position)) -#define MLA_SAT_OP_EXPAND(a, b, c, type, size, position) \ - MLA_SAT_OP_EXPAND_STR(a, b, c, type, size, position) - -/** Saturate multiply-accumulate long - * - * @param[in] type the actual data type. - * @param[in] itype the intermediate data type. - * - * @return The result of the fixed point multiply-accumulate long. The result is saturated in case - * of overflow - */ -#define MLALQ_SAT_IMPL(type, itype) \ - itype mlal_sat_##type(itype VopA, type VopB, type VopC, int fixed_point_position) \ - { \ - itype res = mad_sat(CONVERT(VopB, itype), CONVERT(VopC, itype), \ - (itype)(1 << (fixed_point_position - 1))); \ - return add_sat(VopA, res >> (itype)fixed_point_position); \ - } - -MLALQ_SAT_IMPL(qs8x8, qs16x8) -MLALQ_SAT_IMPL(qs16x8, qs32x8) - -#define MLAL_SAT_OP_EXPAND_STR(a, b, c, type, size, position) \ - mlal_sat_##type##x##size((a), (b), (c), (position)) -#define MLAL_SAT_OP_EXPAND(a, b, c, type, size, position) \ - MLAL_SAT_OP_EXPAND_STR(a, b, c, type, size, position) - -/** Saturate division of two fixed point vectors - * - * @param[in] stype the actual scalar data type. - * @param[in] type the actual data type. - * @param[in] itype the intermediate data type. - * - * @return The result of the fixed point division. The result is saturated in case of overflow - */ -#define DIVQ_SAT_IMPL(stype, type, itype) \ - inline type div_sat_##type(type VopA, type VopB, int fixed_point_position) \ - { \ - itype conv_a = CONVERT((VopA), itype); \ - itype denominator = CONVERT((VopB), itype); \ - itype numerator = conv_a << (itype)(fixed_point_position); \ - itype res = select((itype)(numerator / denominator), \ - select((itype)stype##_MAX, (itype)stype##_MIN, (itype)(conv_a < (itype)0)), \ - (itype)(denominator == (itype)0)); \ - return CONVERT_SAT((res), type); \ - } - -DIVQ_SAT_IMPL(qs8, qs8x16, qs16x16) -DIVQ_SAT_IMPL(qs16, qs16x8, qs32x8) -DIVQ_SAT_IMPL(qs16, qs16x16, qs32x16) -DIVQ_SAT_IMPL(qs8, qs8, qs16) -DIVQ_SAT_IMPL(qs16, qs16, qs32) - -#define DIV_SAT_OP_EXPAND_STR(a, b, type, position) div_sat_##type((a), (b), (position)) -#define DIV_SAT_OP_EXPAND(a, b, type, position) DIV_SAT_OP_EXPAND_STR(a, b, type, position) - -#define DIV_SAT_OP_VEC_EXPAND_STR(a, b, type, size, position) \ - div_sat_##type##x##size((a), (b), (position)) -#define DIV_SAT_OP_VEC_EXPAND(a, b, type, size, position) \ - DIV_SAT_OP_VEC_EXPAND_STR(a, b, type, size, position) - -/** Saturate exponential of a fixed point vector - * - * @note Implemented approach uses taylor polynomial to approximate the exponential function. - * - * @param[in] stype the actual scalar data type. - * @param[in] type the actual data type. - * @param[in] size the number of the calculated elements. - * - * @return The result of the fixed point exponential. The result is saturated in case of overflow - */ -#define EXPQ_IMPL(stype, type, size) \ - inline type exp_sat_##type(type VopA, int fixed_point_position) \ - { \ - type const_one = (type)(1 << (fixed_point_position)); \ - type ln2 = (type)((((0x58B9 >> (14 - fixed_point_position))) + 1) >> 1); \ - type inv_ln2 = (type)((((0x38AA >> (14 - fixed_point_position)) + 1) >> 1)) | const_one; \ - type A = (type)(((0x7FBA >> (14 - fixed_point_position)) + 1) >> 1); \ - type B = (type)(((0x3FE9 >> (14 - fixed_point_position)) + 1) >> 1); \ - type C = (type)(((0x1693 >> (14 - fixed_point_position)) + 1) >> 1); \ - type D = (type)(((0x0592 >> (14 - fixed_point_position)) + 1) >> 1); \ - type m = MUL_SAT_OP_EXPAND(VopA, inv_ln2, stype, size, fixed_point_position); \ - type dec_m = m >> (type)fixed_point_position; \ - type alpha = MUL_SAT_OP_EXPAND(dec_m << (type)fixed_point_position, ln2, stype, size, \ - fixed_point_position); \ - alpha = CONVERT(abs_diff(VopA, alpha), type); \ - type sum = add_sat(MUL_SAT_OP_EXPAND(alpha, D, stype, size, fixed_point_position), C); \ - sum = add_sat(MUL_SAT_OP_EXPAND(alpha, sum, stype, size, fixed_point_position), B); \ - sum = add_sat(MUL_SAT_OP_EXPAND(alpha, sum, stype, size, fixed_point_position), A); \ - sum = add_sat(MUL_SAT_OP_EXPAND(alpha, sum, stype, size, fixed_point_position), const_one); \ - return select((type)stype##_MAX, select(sum << dec_m, sum >> -dec_m, dec_m < (type)0), \ - clz(sum) > dec_m); /* Saturate result if needed */ \ - } - -EXPQ_IMPL(qs8, qs8x2, 2) -EXPQ_IMPL(qs8, qs8x4, 4) -EXPQ_IMPL(qs8, qs8x8, 8) -EXPQ_IMPL(qs8, qs8x16, 16) -EXPQ_IMPL(qs16, qs16x2, 2) -EXPQ_IMPL(qs16, qs16x4, 4) -EXPQ_IMPL(qs16, qs16x8, 8) -EXPQ_IMPL(qs16, qs16x16, 16) - -#define EXP_OP_EXPAND_STR(a, type, size, position) exp_sat_##type##x##size((a), (position)) -#define EXP_OP_EXPAND(a, type, size, position) EXP_OP_EXPAND_STR(a, type, size, position) - -/** Saturate logarithm of a fixed point vector - * - * @note Implemented approach uses taylor polynomial to approximate the logarithm function. - * - * @param[in] stype the actual scalar data type. - * @param[in] type the actual data type. - * @param[in] size the number of the calculated elements. - * - * @return The result of the fixed point logarithm. The result is saturated in case of overflow - */ -#define LOGQ_IMPL(stype, type, size) \ - inline type log_sat_##type(type VopA, int fixed_point_position) \ - { \ - type const_one = (type)(1 << (fixed_point_position)); \ - type ln2 = (type)(0x58B9 >> (15 - fixed_point_position)); /* 1.4384189 */ \ - type A = (type)(0x5C0F >> (14 - fixed_point_position)); /* 1.4384189 */ \ - type B = -(type)(0x56AE >> (15 - fixed_point_position)); /* -0.6771900 */ \ - type C = (type)(0x2933 >> (15 - fixed_point_position)); /* 0.3218538 */ \ - type D = -(type)(0x0AA7 >> (15 - fixed_point_position)); /* -0.0832229 */ \ - type inter_a = \ - select(VopA, DIV_SAT_OP_VEC_EXPAND(const_one, VopA, stype, size, fixed_point_position), \ - VopA < const_one); \ - type shift_val = (type)(15 - stype##_SHIFT) - clz(inter_a >> (type)fixed_point_position); \ - inter_a = inter_a >> shift_val; \ - inter_a = sub_sat(inter_a, const_one); \ - type sum = add_sat(MUL_SAT_OP_EXPAND(inter_a, D, stype, size, fixed_point_position), C); \ - sum = add_sat(MUL_SAT_OP_EXPAND(inter_a, sum, stype, size, fixed_point_position), B); \ - sum = add_sat(MUL_SAT_OP_EXPAND(inter_a, sum, stype, size, fixed_point_position), A); \ - sum = MUL_SAT_OP_EXPAND(inter_a, sum, stype, size, fixed_point_position); \ - sum = MUL_SAT_OP_EXPAND(add_sat(sum, shift_val << (type)fixed_point_position), ln2, stype, \ - size, fixed_point_position); \ - return select(select(sum, -sum, VopA < const_one), (type)0, \ - VopA < (type)0); /* Saturate result if needed */ \ - } - -LOGQ_IMPL(qs8, qs8x16, 16) -LOGQ_IMPL(qs16, qs16x8, 8) -LOGQ_IMPL(qs16, qs16x16, 16) - -#define LOG_OP_EXPAND_STR(a, type, size, position) log_sat_##type##x##size((a), (position)) -#define LOG_OP_EXPAND(a, type, size, position) LOG_OP_EXPAND_STR(a, type, size, position) - -/** Saturate inverse square root of a fixed point vector - * - * @note Implemented approach uses Newton's method to approximate the inverse square root function. - * - * @param[in] stype the actual scalar data type. - * @param[in] type the actual data type. - * @param[in] size the number of the calculated elements. - * - * @return The result of the fixed point inverse square root. The result is saturated in case of - * overflow - */ -#define INVSQRTQ_IMPL(stype, type, size) \ - inline type invsqrt_sat_##type(type VopA, int fixed_point_position) \ - { \ - type const_three = (type)(3 << (fixed_point_position)); \ - type shift_value = (type)(16 - stype##_SHIFT) - (clz(VopA) + (type)fixed_point_position); \ - type temp = select((type)(VopA >> shift_value), \ - select((type)stype##_MAX, (type)(VopA << (-shift_value)), \ - (type)(clz(VopA) > (-shift_value))), \ - (type)(shift_value < (type)0)); \ - type x = temp; \ - x = MUL_SAT_OP_EXPAND( \ - x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \ - fixed_point_position), \ - temp, stype, size, fixed_point_position)), \ - stype, size, fixed_point_position) >> \ - 1; \ - x = MUL_SAT_OP_EXPAND( \ - x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \ - fixed_point_position), \ - temp, stype, size, fixed_point_position)), \ - stype, size, fixed_point_position) >> \ - 1; \ - x = MUL_SAT_OP_EXPAND( \ - x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \ - fixed_point_position), \ - temp, stype, size, fixed_point_position)), \ - stype, size, fixed_point_position) >> \ - 1; \ - if (sizeof((stype)(1)) > 1) /* Perform more iterations if datatype is QS16 */ \ - { \ - x = MUL_SAT_OP_EXPAND( \ - x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \ - fixed_point_position), \ - temp, stype, size, fixed_point_position)), \ - stype, size, fixed_point_position) >> \ - 1; \ - x = MUL_SAT_OP_EXPAND( \ - x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \ - fixed_point_position), \ - temp, stype, size, fixed_point_position)), \ - stype, size, fixed_point_position) >> \ - 1; \ - } \ - type shift_value2 = select(shift_value >> 1, (-shift_value) >> 1, shift_value < (type)0); \ - return select((type)(x >> shift_value2), select((type)stype##_MAX, (type)(x << shift_value2), \ - (type)(clz(x) > shift_value2)), \ - (type)(shift_value < (type)0)); /* Saturate result if needed */ \ - } - -INVSQRTQ_IMPL(qs8, qs8x1, 1) -INVSQRTQ_IMPL(qs16, qs16x1, 1) -INVSQRTQ_IMPL(qs8, qs8x16, 16) -INVSQRTQ_IMPL(qs16, qs16x8, 8) - -#define INVSQRT_OP_EXPAND_STR(a, type, size, position) invsqrt_sat_##type##x##size((a), (position)) -#define INVSQRT_OP_EXPAND(a, type, size, position) INVSQRT_OP_EXPAND_STR(a, type, size, position) - -/** Saturate hyperbolic tangent of a fixed point vector - * - * tanh(x) = (e^2x - 1)/(e^2x + 1) - * - * @param[in] stype the actual scalar data type. - * @param[in] type the actual data type. - * @param[in] size the number of the calculated elements. - * - * @return The result of the fixed point hyperbolic tangent. The result is saturated in case of - * overflow - */ -#define TANHQ_IMPL(stype, type, size) \ - inline type tanh_sat_##type(type VopA, int fixed_point_position) \ - { \ - type const_one = (type)(1 << (fixed_point_position)); \ - type const_two = (type)(2 << (fixed_point_position)); \ - type exp2x = \ - EXP_OP_EXPAND(MUL_SAT_OP_EXPAND(const_two, VopA, stype, size, fixed_point_position), \ - stype, size, fixed_point_position); \ - type num = SUB_SAT_OP_EXPAND(exp2x, const_one, stype, size); \ - type den = ADD_SAT_OP_EXPAND(exp2x, const_one, stype, size); \ - return DIV_SAT_OP_VEC_EXPAND(num, den, stype, size, fixed_point_position); \ - } - -TANHQ_IMPL(qs8, qs8x16, 16) -TANHQ_IMPL(qs16, qs16x8, 8) - -#define TANH_OP_EXPAND_STR(a, type, size, position) tanh_sat_##type##x##size((a), (position)) -#define TANH_OP_EXPAND(a, type, size, position) TANH_OP_EXPAND_STR(a, type, size, position) - -#define floatx16 float16 -#define float16_TYPE float16 - -#define CONVERTQ_DOWN_IMPL(in_type, out_type) \ - inline out_type convert_##out_type##_##in_type(in_type a, int fixed_point_position) \ - { \ - return CONVERT(a * (1 << fixed_point_position) + \ - select((in_type)-0.5f, (in_type)0.5f, isgreater(a, (in_type)0)), \ - out_type); \ - } - -CONVERTQ_DOWN_IMPL(float16, qs8x16) -CONVERTQ_DOWN_IMPL(float16, qs16x16) - -#define CONVERTQ_DOWN_SAT_IMPL(in_type, out_type) \ - inline out_type convert_##out_type##_##in_type##_sat(in_type a, int fixed_point_position) \ - { \ - return CONVERT_SAT(a * (1 << fixed_point_position) + \ - select((in_type)-0.5f, (in_type)0.5f, isgreater(a, (in_type)0)), \ - out_type); \ - } - -CONVERTQ_DOWN_SAT_IMPL(float16, qs8x16) -CONVERTQ_DOWN_SAT_IMPL(float16, qs16x16) - -#define CONVERTQ_UP_IMPL(in_type, out_type) \ - inline out_type convert_##out_type##_##in_type(in_type a, int fixed_point_position) \ - { \ - return CONVERT(a, out_type) / (1 << fixed_point_position); \ - } - -CONVERTQ_UP_IMPL(qs8x16, float16) -CONVERTQ_UP_IMPL(qs16x16, float16) - -#define SQCVT_SAT_IMPL(type) \ - inline type sqcvt_##type##_sat(float a, int fixed_point_position) \ - { \ - return CONVERT_SAT((a * (1 << fixed_point_position) + ((a < 0) ? -0.5f : 0.5f)), type); \ - } - -SQCVT_SAT_IMPL(qs8) -SQCVT_SAT_IMPL(qs16) - -#define SQCVT_SAT_OP_EXPAND_STR(a, type, position) sqcvt_##type##_sat((a), (position)) -#define SQCVT_SAT_OP_EXPAND(a, type, position) SQCVT_SAT_OP_EXPAND_STR((a), type, position) - -#endif // ARM_COMPUTE_FIXED_POINT_H diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl index 25e20f5f2..6b767d6c9 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl @@ -2,25 +2,17 @@ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved * Copyright (c) 2017 ARM Limited. * - * SPDX-License-Identifier: MIT + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: + * http://www.apache.org/licenses/LICENSE-2.0 * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ #include "helpers.h" diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl new file mode 100644 index 000000000..ed7409852 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS) +/** Perform hashtable_lookup of input tensor + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16 + * @attention Number of input dimensions are passed as a preprocessor argument using -DNUM_DIMS=size, e.g. -DNUM_DIMS=4 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] lookups_ptr Pointer to the lookups vector. Supported data types: S32 + * @param[in] lookups_stride_x Stride of the lookups vector in X dimension (in bytes) + * @param[in] lookups_step_x lookups_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector + */ +__kernel void hashtable_lookup(TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output), + VECTOR_DECLARATION(lookups)) +{ + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT); + + Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups); + + int lup_id[4] = {0}; + + lup_id[0] = (NUM_DIMS == 1)?*((__global int *)vector_offset(&lups,get_global_id(0))) + :get_global_id(0); + lup_id[1] = (NUM_DIMS == 2)?*((__global int *)vector_offset(&lups,get_global_id(1))) + :get_global_id(1); + lup_id[2] = (NUM_DIMS == 3)?*((__global int *)vector_offset(&lups,get_global_id(2))) + :get_global_id(2)%DEPTH_OUT; + lup_id[3] = (NUM_DIMS == 4)?*((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) + :get_global_id(2) / DEPTH_OUT; + + if (lup_id[NUM_DIMS-1] < 0) + { + VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, 0, (__global DATA_TYPE *)out.ptr); + return; + } + + in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + lup_id[1] * input_step_y + + lup_id[2] * input_step_z + lup_id[3] * input_step_w; + + VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), + 0, (__global DATA_TYPE *)out.ptr); +} +#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h index 8143d2398..0e123ae0a 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h @@ -24,15 +24,23 @@ #ifndef ARM_COMPUTE_HELPER_H #define ARM_COMPUTE_HELPER_H -#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) +#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) #pragma OPENCL EXTENSION cl_khr_fp16 : enable -#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) +#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) -#if defined(ARM_COMPUTE_DEBUG_ENABLED) -#if defined(cl_arm_printf) +#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) +#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) + +#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && \ + defined(cl_arm_integer_dot_product_accumulate_int8) +#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && + // defined(cl_arm_integer_dot_product_accumulate_int8) + +#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) #pragma OPENCL EXTENSION cl_arm_printf : enable -#endif // defined(cl_arm_printf) -#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) +#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) #define EXPAND(x) x @@ -175,7 +183,7 @@ typedef struct Tensor4D * * @return An image object */ -Vector inline update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, +inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x) { Vector vector = { @@ -201,7 +209,7 @@ Vector inline update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_ * * @return An image object */ -Image inline update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, +inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) { Image img = {.ptr = ptr, @@ -230,7 +238,7 @@ Image inline update_image_workitem_ptr(__global uchar *ptr, uint offset_first_el * * @return A 3D tensor object */ -Image inline update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, +inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) @@ -261,7 +269,7 @@ Image inline update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, * * @return A 3D tensor object */ -Tensor3D inline update_tensor3D_workitem_ptr(__global uchar *ptr, +inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) @@ -276,7 +284,7 @@ Tensor3D inline update_tensor3D_workitem_ptr(__global uchar *ptr, return tensor; } -Tensor4D inline update_tensor4D_workitem_ptr(__global uchar *ptr, +inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w, uint step_w, uint mod_size) @@ -299,7 +307,7 @@ Tensor4D inline update_tensor4D_workitem_ptr(__global uchar *ptr, * @param[in] vec Pointer to the starting position of the buffer * @param[in] x Relative X position */ -__global inline const uchar *vector_offset(const Vector *vec, int x) +inline __global const uchar *vector_offset(const Vector *vec, int x) { return vec->ptr + x * vec->stride_x; } @@ -310,7 +318,7 @@ __global inline const uchar *vector_offset(const Vector *vec, int x) * @param[in] x Relative X position * @param[in] y Relative Y position */ -__global inline uchar *offset(const Image *img, int x, int y) +inline __global uchar *offset(const Image *img, int x, int y) { return img->ptr + x * img->stride_x + y * img->stride_y; } @@ -322,7 +330,7 @@ __global inline uchar *offset(const Image *img, int x, int y) * @param[in] y Relative Y position * @param[in] z Relative Z position */ -__global inline const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z) +inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z) { return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z; } @@ -335,7 +343,7 @@ __global inline const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int * @param[in] z Relative Z position * @param[in] w Relative W position */ -__global inline const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w) +inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w) { return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w; diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl new file mode 100644 index 000000000..e3aa463db --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(DATA_TYPE) +/** Performs a negation of input tensor. + * + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * + * @param[in] in_ptr Pointer to the source image. Supported data types: S16/S32/F16/F32. + * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p input_ptr + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image + */ +__kernel void neg_tensor( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output)) +{ + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VSTORE(VEC_SIZE) + (-VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), 0, (__global DATA_TYPE *)output.ptr); +} +#endif // defined(DATA_TYPE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl new file mode 100644 index 000000000..ecf4696e9 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016, 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#if defined(IW) && defined(IH) && defined(ID) && defined(IB) && defined(DEPTH_OUT) && defined(ZERO_VALUE) +/** Perform space to depth rearrangement of tensor + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 + * @attention Input dimensions should be passed as a preprocessor argument using -DIW(width), -DIH(height), -DID(depth) and -DIB(batch). e.g. -DIW = 4 + * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p inpu +t_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in +bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * + * @param[in] pad_values Padding values for each of the dimensions. Only pad values for Up(for + * batch), Top(for height), Left(for width) and Front(for depth) are + * required. Supported data type: S32 + */ + +__kernel void pad( + TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output), + const int4 pad_values) + { + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + + int index[4]={0}; + + index[0] = get_global_id(0);//W + index[1] = get_global_id(1);//H + index[2] = get_global_id(2) % DEPTH_OUT;//C + index[3] = get_global_id(2) / DEPTH_OUT;//N + + if (index[0] < pad_values.x || index[0] >= (IW + pad_values.x) || + index[1] < pad_values.y || index[1] >= (IH + pad_values.y) || + index[2] < pad_values.z || index[2] >= (ID + pad_values.z) || + index[3] < pad_values.w || index[3] >= (IB + pad_values.w)) + { + *((__global DATA_TYPE *)out.ptr) = (DATA_TYPE)ZERO_VALUE; + } + else + { + *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *) + tensor4D_offset(&in, index[0] - pad_values.x, + index[1] - pad_values.y, + index[2] - pad_values.z, + index[3] - pad_values.w)); + } + } + +#endif //if defined(IW) && defined(IH) && defined(ID) && defined(IB) && defined(DEPTH_OUT) && defined(ZERO_VALUE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl new file mode 100644 index 000000000..7cc8b0354 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4) +/** Perform a Generic permute operation on an input tensor of Shape DCHW. + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16 + * @attention Permutation vector is passed as a preprocessor arguement using -DP1, -DP2, -DP3 and -DP4=int, e.g. -DP1=2 + * + * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U1 +6/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in b +ytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in b +ytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in b +ytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p inpu +t_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in +bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void permute_generic( + TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output)) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); + + int out_index[4]; + int in_index[4]; + in_index[0] = get_global_id(0);//W + in_index[1] = get_global_id(1);//H + in_index[2] = get_global_id(2) % DEPTH_IN;//C + in_index[3] = get_global_id(2) / DEPTH_IN;//B + out_index[0] = in_index[P1]; + out_index[1] = in_index[P2]; + out_index[2] = in_index[P3]; + out_index[3] = in_index[P4]; + + *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0],out_index[1],out_index[2],out_index[3])) = *((__global DATA_TYPE *)in.ptr); +} +#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl index 512c62023..aa05121b1 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl @@ -2,25 +2,17 @@ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved * Copyright (c) 2016, 2017 ARM Limited. * - * SPDX-License-Identifier: MIT + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: + * http://www.apache.org/licenses/LICENSE-2.0 * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ #include "helpers.h" diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl index 82edf3b1d..fdfb78003 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl @@ -2,40 +2,20 @@ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved * Copyright (c) 2016, 2017 ARM Limited. * - * SPDX-License-Identifier: MIT + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: + * http://www.apache.org/licenses/LICENSE-2.0 * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ #include "helpers.h" -#if defined(FIXED_POINT_POSITION) - -#include "fixed_point.h" - -#if defined(SATURATE) -#define DIV_OP(x, y, scale, type, size) DIV_SAT_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION) -#else // SATURATE -#define DIV_OP(x, y, scale, type, size) DIV_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION) -#endif // SATURATE - -#else // FIXED_POINT_POSITION - #if defined(SATURATE) #define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size##_sat(x)) #else // SATURATE @@ -45,17 +25,14 @@ #define DIV_OP(x, y, scale, type, size) CONVERT_OP_INT((x) / (y) >> scale, type, size) -#endif // FIXED_POINT_POSITION - /** Performs a pixelwise division with integer scale of integer inputs. * * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short * @attention The data_type of the intermediate result of the division should passed as well using -DDATA_TYPE_RES. * e.g. If one of inputs is S16 -DDATA_TYPE_RES=int should be passed else -DDATA_TYPE_RES=short. - * @note In case of fixed-point operation -DFIXED_POINT_POSITION=fixed_point_position must be provided: e.g. -DFIXED_POINT_POSITION=3 * - * @param[in] in1_ptr Pointer to the source image. Supported data types: U8/QS8/QS16/S16 + * @param[in] in1_ptr Pointer to the source image. Supported data types: U8/S16 * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) @@ -79,7 +56,7 @@ * @param[in] out_stride_z Stride of the destination image in Y dimension (in bytes) * @param[in] out_step_z out_stride_z * number of elements along Y processed per workitem(in bytes) * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] scale Integer scaling factor. Supported data types: S32 (ignored for QS8 and QS16 as the assumption is scale = 1). + * @param[in] scale Integer scaling factor. Supported data types: S32 */ __kernel void pixelwise_div_int( TENSOR3D_DECLARATION(in1), diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl index ddc9d5a27..ab1307e64 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl @@ -2,25 +2,17 @@ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved * Copyright (c) 2016, 2017 ARM Limited. * - * SPDX-License-Identifier: MIT + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: + * http://www.apache.org/licenses/LICENSE-2.0 * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ #include "helpers_asymm.h" diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl new file mode 100644 index 000000000..68da2ba32 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(DATA_TYPE) +/** Returns result of prelu function implemented as below: + * f(input) = alpha * input for input < 0, f(input) = input for input >= 0. + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Can only take floating point data types. + * + * @param[in] input1_ptr Pointer to the source image. Supported Data types : F16/F32 + * @param[in] input1_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input1_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source image + * + * @param[in] alpha_ptr Pointer to the source image. Supported Data types : F16/F32 + * @param[in] alpha_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] alpha_step_x input2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] alpha_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] alpha_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] alpha_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source image + * + * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void prelu( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(alpha), + TENSOR3D_DECLARATION(output)) +{ + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VSTORE(VEC_SIZE) + (VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) < 0 ? + VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) * VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)alpha.ptr) : + VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), + 0, (__global DATA_TYPE *)output.ptr); + +} +#endif // defined(DATA_TYPE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl new file mode 100644 index 000000000..7e97b7ed6 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" +#define SUB(x, y) (x) - (y) + +#if defined(OFF_IN1) && defined(OFF_IN2) && defined(OFF_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(VEC_SIZE) + +#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE) +#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) +#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE) +#define CONVERT_RTE(x, type) (convert_##type##_rte((x))) +#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type) + +/** Returns result of prelu function implemented as below: + * f(input) = alpha * input for input < 0, f(input) = input for input >= 0. + * + * @attention Data type can be passed using the -DDATA_TYPE_IN compile flag, e.g. -DDATA_TYPE_IN=uchar + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Can only take uchar data types. + * + * @param[in] input1_ptr Pointer to the source image. Supported Data types : QASYMM8 + * @param[in] input1_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input1_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source image + * + * @param[in] alpha_ptr Pointer to the source image. Supported Data types : QASYMM8 + * @param[in] alpha_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] alpha_step_x input2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] alpha_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] alpha_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] alpha_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source image + * + * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void prelu_qasymm8( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(alpha), + TENSOR3D_DECLARATION(output)) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)input.ptr), VEC_INT); + VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)alpha.ptr), VEC_INT); + + in_a = SUB(in_a, (VEC_INT)((int)OFF_IN1)); + in_b = SUB(in_b, (VEC_INT)((int)OFF_IN2)); + + const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1); + const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2); + const VEC_FLOAT outf32 = in1f32 < 0 ? in1f32 * in2f32 : in1f32; + const VEC_FLOAT qresf32 = outf32 / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFF_OUT)); + const VEC_UCHAR res = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_UCHAR); + + VSTORE(VEC_SIZE) + (res, 0, (__global uchar *)output.ptr); +} + +#endif // defined(OFF_IN1) && defined(OFF_IN2) && defined(OFF_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(VEC_SIZE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl deleted file mode 100644 index dfa3b85f4..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#if defined(WIDTH) -/** Perform reduce max - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * - * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[out] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[out] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[out] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void reduce_max(VECTOR_DECLARATION(input), - VECTOR_DECLARATION(output)) -{ - Vector input = CONVERT_TO_VECTOR_STRUCT(input); - Vector output = CONVERT_TO_VECTOR_STRUCT(output); - - __global float *input_addr = (__global float *)(input.ptr); - __global float *output_addr = (__global float *)(output.ptr); - - float max_value = *input_addr; - for(int x = 1; x < WIDTH; x++) - { - float value = *(input_addr + x); - max_value = max(value, max_value); - } - - // Store max - *output_addr = max_value; -} -#endif // defined(WIDTH) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl new file mode 100644 index 000000000..8bef49363 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016, 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) +/** Perform reduce max/min + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 + * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using + * -DOP_CODE = number. e.g. -DOP_CODE=1 + * + * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] axis Axis through which reduction occurs + * @param[in] dim Dimension across the axis to be reduced. + */ +__kernel void reduce_min_max(TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output), + const int axis, + const int dim) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + + int indices[4] = + { + get_global_id(0), + get_global_id(1), + get_global_id(2) % DEPTH_OUT, + get_global_id(2) / DEPTH_OUT, + }; + + DATA_TYPE value = *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); + for(int i = 1; i < dim; ++i) + { + indices[axis] = i; + + #if OP_CODE == 1 // REDUCE_MAX + value = max(value, *((__global DATA_TYPE *) + tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]))); + + #elif OP_CODE == 2 // REDUCE_MIN + value = min(value, *((__global DATA_TYPE *) + tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]))); + + #else // OP NOT SUPPORTED + return; + + #endif + } + + *((__global DATA_TYPE *)out.ptr) = value; +} + +/** Perform reduce sum/mean + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 + * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using + * -DOP_CODE = number. e.g. -DOP_CODE=1 + * + * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] axis Axis through which reduction occurs + * @param[in] dim Dimension across the axis to be reduced. + */ +__kernel void reduce_sum_mean(TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output), + const int axis, + const int dim) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + + int indices[4] = + { + get_global_id(0), + get_global_id(1), + get_global_id(2) % DEPTH_OUT, + get_global_id(2) / DEPTH_OUT, + }; + + DATA_TYPE sum_value = (DATA_TYPE)0; + for(int i = 0; i < dim; ++i) + { + indices[axis] = i; + sum_value += *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); + } + + #if OP_CODE == 3 // REDUCE_SUM + *((__global DATA_TYPE *)out.ptr) = sum_value; + + #elif OP_CODE == 4 // REDUCE_MEAN + *((__global DATA_TYPE *)out.ptr) = sum_value / CONVERT(dim, DATA_TYPE); + + #else // OP NOT SUPPORTED + return; + + #endif +} +#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduction_mean.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduction_mean.cl deleted file mode 100644 index 1a96eea61..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduction_mean.cl +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016, 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -inline DATA_TYPE sum_8(__global const DATA_TYPE *input) -{ - VEC_DATA_TYPE(DATA_TYPE, 8) - in = vload8(0, input); - in.s0123 += in.s4567; - in.s01 += in.s23; - return ((in.s0 + in.s1)); -} - -/** This function calculates the sum and sum of squares of a given input image. - * - * @note To enable calculation sum of squares -DSTDDEV should be passed as a preprocessor argument. - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] local_sum Local sum of all elements - * @param[in] height Height of the input image - * @param[in] divider Divider to calculate mean - */ -__kernel void reduction_mean( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst), - __local DATA_TYPE *local_sums, - int height, - int divider) -{ - // Get pixels pointer - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - float8 tmp_sum = 0; - // Calculate partial sum - - for(int i = 0; i < height; i++) - { - local_sums[0] += sum_8((__global DATA_TYPE *)offset(&src, 0, i)); - } - ((__global DATA_TYPE *)offset(&dst, get_global_id(0), get_global_id(1)))[0] = local_sums[0]/divider; -} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl new file mode 100644 index 000000000..a0fc2d5a9 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016, 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) +/** Perform space to batch with input of 4D and NCHW format + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 + * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size. e.g. -DBATCH_IN=16 + * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DHEIGHT_IN=16 + * @attention Input tensor width should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DWIDTH_IN=16 + * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] block_size_ptr Pointer to the source tensor. Supported data types: S32 + * @param[in] block_size_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] block_size_step_x block_size_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] block_size_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] padding_size_ptr Pointer to the source tensor. Supported data types: S32 + * @param[in] padding_size_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] padding_size_step_x padding_size_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] padding_size_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] padding_size_step_y padding_size_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] padding_size_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void space_to_batch_4d_nchw(TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output), + VECTOR_DECLARATION(block_size), + IMAGE_DECLARATION(padding_size)) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + + int block_size_x = *((__global int *)(block_size_ptr)); + int block_size_y = *((__global int *)(block_size_ptr + block_size_stride_x)); + int shift_x = (get_global_id(2) / DEPTH_OUT / BATCH_IN) % block_size_x; + int shift_y = (get_global_id(2) / DEPTH_OUT / BATCH_IN) / block_size_x; + + int in_index[4] = {0, }; + in_index[0] = get_global_id(0) * block_size_x + shift_x - *((__global int *)(padding_size_ptr)); + in_index[1] = get_global_id(1) * block_size_y + shift_y - *((__global int *)(padding_size_ptr + padding_size_stride_y)); + in_index[2] = get_global_id(2) % DEPTH_OUT; + in_index[3] = (get_global_id(2) / DEPTH_OUT) % BATCH_IN; + + if (in_index[0] < 0 || in_index[0] >= WIDTH_IN || in_index[1] < 0 || in_index[1] >= HEIGHT_IN) + { + *((__global DATA_TYPE *)out.ptr) = (DATA_TYPE)ZERO_VALUE; + } + else + { + *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3])); + } +} +#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) + +#if defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE) +/** Perform space to batch with input of 4D and NHWC format + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Output tensor depth should be given as a preprocessor argument using -DHEIGHT_OUT=size. e.g. -DHEIGHT_OUT=16 + * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size. e.g. -DBATCH_IN=16 + * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DHEIGHT_IN=16 + * @attention Input tensor width should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DWIDTH_IN=16 + * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0 + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] block_size_ptr Pointer to the source tensor. Supported data types: S32 + * @param[in] block_size_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] block_size_step_x block_size_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] block_size_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] padding_size_ptr Pointer to the source tensor. Supported data types: S32 + * @param[in] padding_size_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] padding_size_step_x padding_size_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] padding_size_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] padding_size_step_y padding_size_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] padding_size_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void space_to_batch_4d_nhwc(TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output), + VECTOR_DECLARATION(block_size), + IMAGE_DECLARATION(padding_size)) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, HEIGHT_OUT); + + int block_size_x = *((__global int *)(block_size_ptr)); + int block_size_y = *((__global int *)(block_size_ptr + block_size_stride_x)); + int shift_x = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) % block_size_x; + int shift_y = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) / block_size_x; + + int in_index[4] = {0, }; + in_index[0] = get_global_id(0) * VEC_SIZE; + in_index[1] = get_global_id(1) * block_size_x + shift_x - *((__global int *)(padding_size_ptr)); + in_index[2] = get_global_id(2) % HEIGHT_OUT * block_size_y + shift_y - *((__global int *)(padding_size_ptr + padding_size_stride_y)); + in_index[3] = (get_global_id(2) / HEIGHT_OUT) % BATCH_IN; + + if (in_index[1] < 0 || in_index[1] >= WIDTH_IN || in_index[2] < 0 || in_index[2] >= HEIGHT_IN) + { + VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))ZERO_VALUE, 0, (__global DATA_TYPE *)out.ptr); + } + else + { + VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3])), + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), + 0, (__global DATA_TYPE *)out.ptr); + } +} + +#endif // defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl new file mode 100644 index 000000000..f6977045a --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016, 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) +/** Perform space to depth rearrangement of tensor + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16 + * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. -DBLOCK_SIZE=1 + * + * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p inpu +t_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in +bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void space_to_depth( + TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output)) + { + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); + + int out_index[4]={0}; + int in_index[4]={0}; + + in_index[0] = get_global_id(0);//W + in_index[1] = get_global_id(1);//H + in_index[2] = get_global_id(2) % DEPTH_IN;//C + in_index[3] = get_global_id(2) / DEPTH_IN;//B + + out_index[0] = in_index[0]/BLOCK_SIZE; + out_index[1] = in_index[1]/BLOCK_SIZE; + out_index[2] = in_index[2] + ((in_index[1] % BLOCK_SIZE) * BLOCK_SIZE + in_index[0] % BLOCK_SIZE) * DEPTH_IN; + out_index[3] = in_index[3]; + + *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0],out_index[1],out_index[2],out_index[3])) = *((__global DATA_TYPE *)in.ptr); + } +#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl new file mode 100644 index 000000000..3e1a5c97f --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(DATA_TYPE) +/** Returns true value of squared_difference of two tensors. + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Can only take floating point data types. + * + * @param[in] input1_ptr Pointer to the source image. Supported data types: F16/F32 + * @param[in] input1_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input1_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source image + * + * @param[in] input2_ptr Pointer to the source image. Supported data types: F16/F32 + * @param[in] input2_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input2_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input2_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input2_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input2_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source image + * + * @param[out] output_ptr Pointer to the destination image. Supported data types: F16/F32 + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void squared_difference( + TENSOR3D_DECLARATION(input1), + TENSOR3D_DECLARATION(input2), + TENSOR3D_DECLARATION(output)) +{ + Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1); + Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + diff = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr)- VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr); + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + sq_diff = diff * diff; + + VSTORE(VEC_SIZE) + (sq_diff, 0, (__global DATA_TYPE *)output.ptr); +} +#endif // defined(DATA_TYPE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice.cl deleted file mode 100644 index c5ff82f9e..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice.cl +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - - -inline Tensor4D tensor4D_from_vector_no_step(const Vector *vector, int dim_x, int dim_y, int dim_z, int dim_w) -{ - int stride_x = vector->stride_x; - int stride_y = stride_x * dim_x; - int stride_z = stride_y * dim_y; - int stride_w = stride_z * dim_z; - Tensor4D tensor = - { - .ptr = vector->ptr, - .offset_first_element_in_bytes = vector->offset_first_element_in_bytes, - .stride_x = stride_x, - .stride_y = stride_y, - .stride_z = stride_z, - .stride_w = stride_w, - }; - return tensor; -} - -/** Extracts a strided slice up to 4-dimensions - * - * @note Datatype should be given as a preprocessor argument using -DELEMENT_DATA_TYPE=type. e.g. -DELEMENT_DATA_TYPE=short - * @note The size of an element should be given as a preprocessor argument using -DELEMENT_SIZE=size. e.g. -DELEMENT_SIZE=2 - * - * @param[in] input_ptr Pointer to the first source tensor. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32 - * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] dims_in The 4-dimensional dimension of the input. Supported data types: S32 - * @param[in] dims_out The 4-dimensional dimension of the output. Supported data types: S32 - * @param[in] starts The stride of X dimension of input tensor to be sliced. Supported data types: S32 - * @param[in] strides The stride of Y dimension of input tensor to be sliced. Supported data types: S32 - */ -__kernel void strided_slice(VECTOR_DECLARATION(input), - VECTOR_DECLARATION(output), - const int4 dims_in, - const int4 dims_out, - const int4 starts, - const int4 strides) -{ - // TODO: Should be change to CONVERT_TO_TENSOR4D_STRUCT in order to reduce inference of the offset - Vector vec_out = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output); - Vector vec_in = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input); - - // Implemenation - // Infer a Tensor4D from output Vector and output's dimensions info - // Infer a Tensor4D from input Vector and input's dimensions info - // Infer indices of output as 4D from the offset of output vector - // Infer indices of input as 4D from indices of output - // out(offset of output vector) = in(offset of input) - - Tensor4D tensor_out = tensor4D_from_vector_no_step(&vec_out, dims_out.x, dims_out.y, dims_out.z, dims_out.w); - Tensor4D tensor_in = tensor4D_from_vector_no_step(&vec_in, dims_in.x, dims_in.y, dims_in.z, dims_in.w); - - // Must be output_step_x == output_stride_x == an element's size - const int offset_out = get_global_id(0) * output_stride_x; - int4 indices_out = - { - get_global_id(0) % dims_out.x, - (offset_out / tensor_out.stride_y) % dims_out.y, - (offset_out / tensor_out.stride_z) % dims_out.z, - (offset_out / tensor_out.stride_w) % dims_out.w, - }; - - int4 indices_in = - { - starts.x + (strides.x * indices_out.x), - starts.y + (strides.y * indices_out.y), - starts.z + (strides.z * indices_out.z), - starts.w + (strides.w * indices_out.w), - }; - - *((__global ELEMENT_DATA_TYPE *)vector_offset(&vec_out, get_global_id(0))) = *((__global ELEMENT_DATA_TYPE *)tensor4D_offset(&tensor_in, indices_in.x, indices_in.y, indices_in.z, indices_in.w)); -} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice_ex.cl new file mode 100644 index 000000000..b39c55b96 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice_ex.cl @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#if defined(ELEMENT_DATA_TYPE) && defined(DEPTH_OUT) +/** Extracts a strided slice up to 4-dimensions + * + * @note Datatype should be given as a preprocessor argument using -DELEMENT_DATA_TYPE=type. e.g. -DELEMENT_DATA_TYPE=short + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 + * + * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] starts The stride of X dimension of input tensor to be sliced. Supported data types: S32 + * @param[in] strides The stride of Y dimension of input tensor to be sliced. Supported data types: S32 + */ +__kernel void strided_slice_ex(TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output), + const int4 starts, + const int4 strides) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + + int4 indices_in = + { + starts.x + (strides.x * get_global_id(0)), + starts.y + (strides.y * get_global_id(1)), + starts.z + (strides.z * (get_global_id(2) % DEPTH_OUT)), + starts.w + (strides.w * (get_global_id(2) / DEPTH_OUT)), + }; + *((__global ELEMENT_DATA_TYPE *)out.ptr) = *((__global ELEMENT_DATA_TYPE *)tensor4D_offset(&in, indices_in.x, indices_in.y, indices_in.z, indices_in.w)); +} +#endif // defined(ELEMENT_DATA_TYPE) && defined(DEPTH_OUT) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl index 0b0cf8218..d97f23a47 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl @@ -2,25 +2,17 @@ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved * Copyright (c) 2017 ARM Limited. * - * SPDX-License-Identifier: MIT + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: + * http://www.apache.org/licenses/LICENSE-2.0 * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ #include "helpers.h" diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl index deadf8412..0292fab04 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl @@ -2,25 +2,17 @@ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved * Copyright (c) 2017 ARM Limited. * - * SPDX-License-Identifier: MIT + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: + * http://www.apache.org/licenses/LICENSE-2.0 * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ #include "helpers.h" diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl index cac0c071e..c2c2d89a4 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl @@ -2,25 +2,17 @@ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved * Copyright (c) 2017 ARM Limited. * - * SPDX-License-Identifier: MIT + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: + * http://www.apache.org/licenses/LICENSE-2.0 * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ // reference: |