Diffstat (limited to 'compute/ARMComputeEx/src/core')
-rw-r--r--  compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp | 39
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl | 137
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl | 191
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl | 233
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl | 185
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h | 206
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h | 185
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl | 120
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl | 138
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl | 185
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp | 181
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp | 1
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp | 132
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp | 140
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp | 1
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.cpp | 372
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp | 1
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp | 3
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp | 2
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp | 1
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp | 1
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp | 210
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp | 3
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp | 1
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp | 1
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp | 148
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp | 188
-rw-r--r--  compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp | 118
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp | 671
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp | 181
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp | 221
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp | 291
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp | 181
34 files changed, 387 insertions(+), 4283 deletions(-)
diff --git a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
index 7b6b9742b..ba42a2456 100644
--- a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
+++ b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
@@ -55,16 +55,7 @@ using namespace arm_compute;
const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map = {
// ARMComputeEx kernels
- {"arg_op", "arg_operation.cl"},
- {"arithmetic_add_qasymm8", "arithmetic_op_quantized.cl"},
{"binary_logical_op", "binary_logical_op.cl"},
- {"cast", "cast.cl"},
- {"cast_qasymm_in", "cast.cl"},
- {"cast_qasymm_out", "cast.cl"},
- {"comparison_op", "comparison_op.cl"},
- {"comparison_op_qasymm8", "comparison_op_quantized.cl"},
- {"depth_to_space_nchw", "depth_to_space.cl"},
- {"depth_to_space_nhwc", "depth_to_space.cl"},
{"embedding_lookup", "embedding_lookup.cl"},
{"gather_ex", "gather_ex.cl"},
{"gather_ex_1d", "gather_ex.cl"},
@@ -74,10 +65,6 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
{"instance_normalization_ex", "instance_normalization_ex.cl"},
{"multiply_scale_factor", "multiply_scale_factor.cl"},
{"neg_tensor", "neg_tensor.cl"},
- {"permute_generic", "permute_ex.cl"},
- {"pixelwise_mul_qasymm8", "pixelwise_mul_quantized.cl"},
- {"prelu", "prelu.cl"},
- {"prelu_qasymm8", "prelu_quantized.cl"},
{"quantization_symm8", "quantization_symm8.cl"},
{"reduce_min_max", "reduce_operation.cl"},
{"reduce_sum_mean", "reduce_operation.cl"},
@@ -91,29 +78,15 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
{"radixsort_reorder", "topkv2_radixsort.cl"},
{"topkv2_quicksort", "topkv2_quicksort.cl"},
{"scale_factor_symm8", "scale_factor.cl"},
- {"space_to_depth_nchw", "space_to_depth.cl"},
- {"space_to_depth_nhwc", "space_to_depth.cl"},
};
const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map = {
#ifdef EMBEDDED_KERNELS
{
- "arg_operation.cl",
-#include "./cl_kernels/arg_operation.clembed"
- },
- {
- "cast.cl",
-#include "./cl_kernels/cast.clembed"
- },
- {
"embedding_lookup.cl",
#include "./cl_kernels/embedding_lookup.clembed"
},
{
- "depth_to_space.cl",
-#include "./cl_kernels/depth_to_space.clembed"
- },
- {
"gather_ex.cl",
#include "./cl_kernels/gather_ex.clembed"
},
@@ -150,14 +123,6 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map
#include "./cl_kernels/neg_tensor.clembed"
},
{
- "prelu.cl",
-#include "./cl_kernels/prelu.clembed"
- },
- {
- "prelu_quantized.cl",
-#include "./cl_kernels/prelu_quantized.clembed"
- },
- {
"quantization_symm8.cl",
#include "./cl_kernels/quantization_symm8.clembed"
},
@@ -170,10 +135,6 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map
#include "./cl_kernels/scale_factor.clembed"
},
{
- "space_to_depth.cl",
-#include "./cl_kernels/space_to_depth.clembed"
- },
- {
"topkv2.cl",
#include "./cl_kernels/topkv2.clembed"
},
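For context: retiring a kernel from this library means deleting two entries, the kernel-name to program-file mapping in _kernel_program_map and, in EMBEDDED_KERNELS builds, the embedded program source in _program_source_map. A minimal host-side sketch of the lookup these maps serve; program_for_kernel is a hypothetical helper, not the actual CLKernelLibraryEx interface:

#include <iostream>
#include <map>
#include <stdexcept>
#include <string>

// Subset of the kernel-name -> program-file entries that remain after this change.
static const std::map<std::string, std::string> kernel_program_map = {
    {"binary_logical_op", "binary_logical_op.cl"},
    {"embedding_lookup", "embedding_lookup.cl"},
    {"neg_tensor", "neg_tensor.cl"},
};

// Resolve a kernel name to the .cl program that defines it.
std::string program_for_kernel(const std::string &kernel_name)
{
  const auto it = kernel_program_map.find(kernel_name);
  if (it == kernel_program_map.end())
    throw std::runtime_error("Kernel " + kernel_name + " not found in the kernel library");
  return it->second;
}

int main()
{
  std::cout << program_for_kernel("neg_tensor") << '\n'; // prints: neg_tensor.cl
}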
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl
deleted file mode 100644
index 03717cfe9..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
-/** Perform arg_max/arg_min
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type.
- * e.g. -DDATA_TYPE=short
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
- * e.g. -DDEPTH_OUT=16
- * @attention Operation type (code) specifying which operation to perform should be passed as a
- *            preprocessor argument using -DOP_CODE=number. e.g. -DOP_CODE=1
- *
- * @param[in] input_ptr Pointer to the source image. Supported data
- * types:
- * U8/QASYMM8/S8/U16/S16/F16/U32/S32/F32
- * @param[in] input_stride_x Stride of the source image in X dimension
- * (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension
- * (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension
- * (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element
- * in the source image
- * @param[in] input_stride_w Stride of the source tensor in W dimension
- * (in bytes)
- * @param[in] input_step_w                           input_stride_w * number of elements along W
- * processed per workitem(in bytes)
- * @param[out] output_ptr Pointer to the destination image.
- * Supported data types: U32
- * @param[in] output_stride_x Stride of the destination image in X dimension
- * (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension
- * (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension
- * (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the source tensor in W dimension
- * (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W
- * processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
- * destination image
- * @param[in] axis Axis through which reduction occurs
- * @param[in] dim Dimension across the axis to be reduced.
- */
-
-__kernel void arg_op(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), const int axis,
- const int dim)
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
-
- int indices[4] = {
- get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT,
- get_global_id(2) / DEPTH_OUT,
- };
-
- DATA_TYPE value =
- *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
- DATA_TYPE tval = value;
- int idx = 0;
- for (int i = 1; i < dim; ++i)
- {
- indices[axis] = i;
-
-#if OP_CODE == 1 // ArgMax
- value = max(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1],
- indices[2], indices[3])));
-#elif OP_CODE == 2 // ArgMin
- value = min(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1],
- indices[2], indices[3])));
-#else
- return;
-
-#endif
-
- if (tval != value)
- {
- idx = indices[axis];
- tval = value;
- }
- }
-
- *((__global uint *)out.ptr) = idx;
-}
-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
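The deleted arg_op kernel scanned one axis per work-item, keeping a running max (or min, depending on OP_CODE) and recording the index where it last changed, so the first occurrence wins on ties. A plain C++ sketch of that scan over a 1-D slice (arg_max is an illustrative stand-in, not library code):

#include <cstddef>
#include <iostream>
#include <vector>

std::size_t arg_max(const std::vector<float> &slice)
{
  float best = slice[0];
  std::size_t idx = 0;
  for (std::size_t i = 1; i < slice.size(); ++i)
  {
    // Strict '>': ties keep the first maximum, matching the kernel's tval != value check.
    if (slice[i] > best)
    {
      best = slice[i];
      idx = i;
    }
  }
  return idx;
}

int main()
{
  std::cout << arg_max({0.5f, 2.0f, -1.0f, 2.0f}) << '\n'; // prints: 1
}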
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
deleted file mode 100644
index f74c1c103..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers_asymm.h"
-
-#ifdef SATURATE
-#define ADD(x, y) add_sat((x), (y))
-#define SUB(x, y) sub_sat((x), (y))
-#else /* SATURATE */
-#define ADD(x, y) (x) + (y)
-#define SUB(x, y) (x) - (y)
-#endif /* SATURATE */
-
-/** Performs a pixelwise addition used to quantize down the int32 accumulator values of GEMMLowp to
- * QASYMM8
- *
- * The following computations will be performed:
- *
- * -# Add offset terms to inputs
- * -# Get scaled value of two inputs
- * -# Add inputs
- * -# Add offset terms to final result
- * -# Multiply each entry of result by result_mult_int
- * -# Shift the int32 accumulator by result_shift
- * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
- *
- * @attention The inputs and output data types need to be passed at compile time using
- * -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
- * @attention The number of bits to shift left of input tensors must be passed at compile time using
- * -DLEFT_SHIFT
- * @attention The offset, scalar scale factor and number of bits to shift right of input tensors
- *            must be passed at compile time using -DIN1_OFFSET, -DIN1_MULT_INT, -DIN1_SHIFT,
- *            -DIN2_OFFSET, -DIN2_MULT_INT and -DIN2_SHIFT
- * @attention The offset, scalar scale factor and number of bits to shift right of output tensor
- *            must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULT_INT and
- *            -DRESULT_SHIFT
- *
- * @attention The input and output data_types need to be passed at compile time using
- * -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
- * @attention The inputs and output scale information of qasymm8 need to be passed at compile time
- * using -DSCALE_IN1, -DSCALE_IN2 and -DSCALE_OUT:
- * e.g. -DSCALE_IN1=1.f -DSCALE_IN2=1.f -DSCALE_OUT=2.f
- * @attention The inputs and output scale offset need to be passed at compile time using
- * -DOFFSET_IN1, -DOFFSET_IN2 and -DOFFSET_OUT:
- * e.g. -DOFFSET_IN1=0 -DOFFSET_IN2=0 -DOFFSET_OUT=0
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
- * -DVEC_SIZE=16
- * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise
- * wrapping policy will be used.
- *
- * @param[in] in1_ptr Pointer to the source tensor.
- * Supported data types: QASYMM8
- * @param[in] in1_stride_x Stride of the source tensor in X dimension
- * (in bytes)
- * @param[in] in1_step_x in1_stride_x * number of elements along X processed
- * per workitem(in bytes)
- * @param[in] in1_stride_y Stride of the source tensor in Y dimension
- * (in bytes)
- * @param[in] in1_step_y in1_stride_y * number of elements along Y processed
- * per workitem(in bytes)
- * @param[in] in1_stride_z Stride of the source tensor in Z dimension
- * (in bytes)
- * @param[in] in1_step_z in1_stride_z * number of elements along Z processed
- * per workitem(in bytes)
- * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source
- * tensor
- * @param[in] in2_ptr Pointer to the source tensor. Supported data types:
- * QASYMM8
- * @param[in] in2_stride_x Stride of the source tensor in X dimension
- * (in bytes)
- * @param[in] in2_step_x in2_stride_x * number of elements along X processed
- * per workitem(in bytes)
- * @param[in] in2_stride_y Stride of the source tensor in Y dimension
- * (in bytes)
- * @param[in] in2_step_y in2_stride_y * number of elements along Y processed
- * per workitem(in bytes)
- * @param[in] in2_stride_z Stride of the source tensor in Z dimension
- * (in bytes)
- * @param[in] in2_step_z in2_stride_z * number of elements along Z processed
- * per workitem(in bytes)
- * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source
- * tensor
- * @param[out] out_ptr Pointer to the destination tensor.
- * Supported data types: QASYMM8
- * @param[in] out_stride_x Stride of the destination tensor in X dimension
- * (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed
- * per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination tensor in Y dimension
- * (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed
- * per workitem(in bytes)
- * @param[in] out_stride_z Stride of the source tensor in Z dimension
- * (in bytes)
- * @param[in] out_step_z out_stride_z * number of elements along Z processed
- * per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination
- * tensor
- */
-__kernel void arithmetic_add_qasymm8(TENSOR3D_DECLARATION(in1), TENSOR3D_DECLARATION(in2),
- TENSOR3D_DECLARATION(out))
-{
- // Get pixels pointer
- Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
- Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
-
- // Load data
- VEC_DATA_TYPE(int, 16)
- in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16));
- VEC_DATA_TYPE(int, 16)
- in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16));
-
- // Get scaled value of two inputs
- VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET);
- VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET);
-
- VEC_DATA_TYPE(int, 16)
- left_shift = (VEC_DATA_TYPE(int, 16))1 << (VEC_DATA_TYPE(int, 16))(LEFT_SHIFT);
- VEC_DATA_TYPE(int, 16) shifted_in1_val = in1_val * left_shift;
- VEC_DATA_TYPE(int, 16) shifted_in2_val = in2_val * left_shift;
-
- VEC_DATA_TYPE(int, 16)
- scaled_in1_val =
- ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in1_val, IN1_MULT_INT, IN1_SHIFT, 16);
- VEC_DATA_TYPE(int, 16)
- scaled_in2_val =
- ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in2_val, IN2_MULT_INT, IN2_SHIFT, 16);
-
- // Add inputs and multiply with a multiplier smaller than 1
- VEC_DATA_TYPE(int, 16) sum_val = scaled_in1_val + scaled_in2_val;
- VEC_DATA_TYPE(int, 16)
- out_val =
- ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(sum_val, RESULT_MULT_INT, RESULT_SHIFT, 16);
- out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET);
-
- VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16));
-
- // TODO: Apply min-max BOUND to support fusion with ReLU.
- /*
- #if defined(MIN_BOUND)
- res = max(res, (uchar16)MIN_BOUND);
- #endif // defined(MIN_BOUND)
- #if defined(MAX_BOUND)
- res = min(res, (uchar16)MAX_BOUND);
- #endif // defined(MAX_BOUND)
- */
-
- // Store result
- VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr);
-}
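The deleted arithmetic_add_qasymm8 kernel is a dequantize-add-requantize in disguise: the offsets bring each input to its real zero point, the per-input multipliers and shifts apply the scales, and the result multiplier, shift and offset requantize the sum. The kernel does all of this in fixed point via ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE; the float sketch below only illustrates the algebra (qadd is hypothetical, and the +0.5f rounding is valid for non-negative results only):

#include <algorithm>
#include <cstdint>
#include <iostream>

std::uint8_t qadd(std::uint8_t a, std::uint8_t b, int off_a, float scale_a, int off_b,
                  float scale_b, int off_out, float scale_out)
{
  // Dequantize both inputs, add in real space, requantize with the output parameters.
  const float real = (a - off_a) * scale_a + (b - off_b) * scale_b;
  const int q = static_cast<int>(real / scale_out + 0.5f) + off_out;
  return static_cast<std::uint8_t>(std::min(255, std::max(0, q))); // clamp to [0..255]
}

int main()
{
  // All tensors quantized with scale 0.5 and zero offset: 10 + 20 -> 30.
  std::cout << int(qadd(10, 20, 0, 0.5f, 0, 0.5f, 0, 0.5f)) << '\n'; // prints: 30
}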
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
deleted file mode 100644
index 4147a0017..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
+++ /dev/null
@@ -1,233 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-
-#ifndef SCALE
-#define SCALE 1.0f
-#endif
-#ifndef OFFSET
-#define OFFSET 0
-#endif
-#ifndef VEC_SIZE
-#define VEC_SIZE 1
-#endif
-
-#if defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
-/** Perform a cast operation on an input tensor.
- *
- * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and
- * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
- * -DVEC_SIZE=16
- * @attention -DBOOL_INPUT : Whether type of input is bool.
- *
- * @param[in] input_ptr Pointer to the source image. Supported data
- * types: F16/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in
- * bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in
- * bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
- * image
- * @param[out] output_ptr Pointer to the destination image. Supported data
- * types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension
- * (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension
- * (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
- * destination image
- */
-__kernel void cast(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
-{
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- VSTORE(VEC_SIZE)
- (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr),
- VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),
- 0, (__global DATA_TYPE_OUT *)output.ptr);
- VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
- res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr),
- VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
-#if defined(BOOL_INPUT)
- VEC_DATA_TYPE(char, VEC_SIZE) tmp = CONVERT(res, VEC_DATA_TYPE(char, VEC_SIZE));
- VEC_DATA_TYPE(char, VEC_SIZE) mask = (VEC_DATA_TYPE(char, VEC_SIZE))(1);
- res = CONVERT(tmp & mask, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
-#endif // defined(BOOL_INPUT)
-
- VSTORE(VEC_SIZE)(res, 0, (__global DATA_TYPE_OUT *)output.ptr);
-}
-
-/** Perform a cast operation on a QASYMM8 input tensor.
- * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and
- * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
- * @attention Offset and Scale of input should be given as a preprocessor argument using
- * -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
- * -DVEC_SIZE=16
- *
- * @param[in] input_ptr Pointer to the source image. Supported data
- * types: F16/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in
- * bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in
- * bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
- * image
- * @param[out] output_ptr Pointer to the destination image. Supported data
- * types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension
- * (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension
- * (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
- * destination image
- */
-__kernel void cast_qasymm_in(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
-{
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
- in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
- VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET);
- VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE);
-
- VEC_DATA_TYPE(int, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(int, VEC_SIZE)) - offset;
- VEC_DATA_TYPE(float, VEC_SIZE) out_data = CONVERT(tmp, VEC_DATA_TYPE(float, VEC_SIZE)) * scale;
-
- VSTORE(VEC_SIZE)
- (CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0,
- (__global DATA_TYPE_OUT *)output.ptr);
-}
-
-/** Perform a cast operation on a QASYMM8 output tensor.
- * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and
- * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
- * @attention Offset and Scale of output should be given as a preprocessor argument using
- * -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
- * -DVEC_SIZE=16
- *
- * @param[in] input_ptr Pointer to the source image. Supported data
- * types: F16/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in
- * bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in
- * bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
- * image
- * @param[out] output_ptr Pointer to the destination image. Supported data
- * types: U8
- * @param[in] output_stride_x Stride of the destination image in X dimension
- * (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension
- * (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
- * destination image
- */
-__kernel void cast_qasymm_out(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
-{
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
- in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
- VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET);
- VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE);
-
- VEC_DATA_TYPE(float, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(float, VEC_SIZE)) / scale;
- VEC_DATA_TYPE(float, VEC_SIZE) out_data = tmp + CONVERT(offset, VEC_DATA_TYPE(float, VEC_SIZE));
-
- VSTORE(VEC_SIZE)
- (CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0,
- (__global DATA_TYPE_OUT *)output.ptr);
-}
-#endif // defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
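The two quantized cast kernels removed here applied the usual affine mapping: cast_qasymm_in computes real = (q - OFFSET) * SCALE and cast_qasymm_out computes q = real / SCALE + OFFSET. A scalar C++ sketch of both directions; the clamp is added here for safety, while the kernel relied on a plain (non-saturating) CONVERT:

#include <algorithm>
#include <cstdint>
#include <iostream>

// cast_qasymm_in direction: quantized value -> real value
float dequantize(std::uint8_t q, int offset, float scale)
{
  return (static_cast<int>(q) - offset) * scale;
}

// cast_qasymm_out direction: real value -> quantized value
std::uint8_t quantize(float x, int offset, float scale)
{
  const int q = static_cast<int>(x / scale) + offset; // CONVERT truncates toward zero
  return static_cast<std::uint8_t>(std::clamp(q, 0, 255));
}

int main()
{
  std::cout << dequantize(130, 128, 0.5f) << ' ' << int(quantize(1.0f, 128, 0.5f)) << '\n';
  // prints: 1 130
}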
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
deleted file mode 100644
index 0285c955b..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
-/** Perform depth to space rearrangement of tensor
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
- * e.g. -DDEPTH_OUT=16
- * @attention The value of the z-axis of output tensor should be given as a preprocessor argument
- * using -DZ_OUT=size. e.g. -DZ_OUT=16
- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g.
- * -DBLOCK_SIZE=1
- *
- * @param[in] input_ptr Pointer to the source image. Supported data
- * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in
- * bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in
- * bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
- * image
- * @param[out] output_ptr Pointer to the destination image. Supported data
- * types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension
- * (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension
- * (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the source tensor in W dimension (in
- * bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W
- * processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
- * destination image
- */
-__kernel void depth_to_space_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT);
-
- int out_index[4] = {0};
- int in_index[4] = {0};
-
- out_index[0] = get_global_id(0); // W
- out_index[1] = get_global_id(1); // H
- out_index[2] = get_global_id(2) % Z_OUT; // C
- out_index[3] = get_global_id(2) / Z_OUT; // B
-
- in_index[0] = out_index[0] / BLOCK_SIZE;
- in_index[1] = out_index[1] / BLOCK_SIZE;
- in_index[2] = out_index[2] +
- ((out_index[1] % BLOCK_SIZE) * BLOCK_SIZE + out_index[0] % BLOCK_SIZE) * DEPTH_OUT;
- in_index[3] = out_index[3];
-
- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(
- &in, in_index[0], in_index[1], in_index[2], in_index[3]));
-}
-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
-
-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
-/** Perform depth to space rearrangement of tensor (NHWC)
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
- * e.g. -DDEPTH_OUT=16
- * @attention The value of the z-axis of output tensor should be given as a preprocessor argument
- * using -DZ_OUT=size. e.g. -DZ_OUT=16
- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g.
- * -DBLOCK_SIZE=1
- *
- * @param[in] input_ptr Pointer to the source image. Supported data
- * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in
- * bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in
- * bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
- * image
- * @param[out] output_ptr Pointer to the destination image. Supported data
- * types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension
- * (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension
- * (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the source tensor in W dimension (in
- * bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W
- * processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
- * destination image
- */
-__kernel void depth_to_space_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT);
-
- int out_index[4] = {0};
- int in_index[4] = {0};
-
- out_index[0] = get_global_id(0); // C
- out_index[1] = get_global_id(1); // W
- out_index[2] = get_global_id(2) % Z_OUT; // H
- out_index[3] = get_global_id(2) / Z_OUT; // B
-
- in_index[0] = out_index[0] +
- ((out_index[2] % BLOCK_SIZE) * BLOCK_SIZE + out_index[1] % BLOCK_SIZE) * DEPTH_OUT;
- in_index[1] = out_index[1] / BLOCK_SIZE;
- in_index[2] = out_index[2] / BLOCK_SIZE;
- in_index[3] = out_index[3];
-
- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(
- &in, in_index[0], in_index[1], in_index[2], in_index[3]));
-}
-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
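Both deleted kernels are pure index remappings; no arithmetic is done on the values. For the NCHW variant, output element (w, h, c, b) reads input (w / B, h / B, c + ((h % B) * B + w % B) * DEPTH_OUT, b), where B is BLOCK_SIZE and DEPTH_OUT is the output channel count. A small C++ sketch of that mapping (d2s_nchw_src is an illustrative helper):

#include <array>
#include <iostream>

std::array<int, 4> d2s_nchw_src(int w, int h, int c, int b, int block, int depth_out)
{
  // Input coordinates {W, H, C, B} read by output element (w, h, c, b).
  return {w / block, h / block, c + ((h % block) * block + w % block) * depth_out, b};
}

int main()
{
  // block_size = 2, depth_out = 1: output (3, 2, 0, 0) reads input (1, 1, 1, 0)
  const auto src = d2s_nchw_src(3, 2, 0, 0, 2, 1);
  std::cout << src[0] << ' ' << src[1] << ' ' << src[2] << ' ' << src[3] << '\n'; // 1 1 1 0
}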
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
index 2d0b6a299..e07a25ec9 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
@@ -15,7 +15,7 @@
*/
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,7 +37,6 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-
#ifndef ARM_COMPUTE_HELPER_H
#define ARM_COMPUTE_HELPER_H
@@ -59,16 +58,219 @@
#pragma OPENCL EXTENSION cl_arm_printf : enable
#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
#define EXPAND(x) x
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each
+ * element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 2)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
#define VLOAD_STR(size) vload##size
#define VLOAD(size) VLOAD_STR(size)
#define VSTORE_STR(size) vstore##size
#define VSTORE(size) VSTORE_STR(size)
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// The convert_* built-ins with the _sat modifier are not supported for floating-point types,
+// so we define _sat aliases without saturation to work around this.
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
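For reference, REV4(x) expands to the swizzle (x).s3210 and ROT4_1(x) to (x).s3012, i.e. a rotate-right by one lane. A host-side C++ model of the same lane permutations, with std::array standing in for an OpenCL vector (reverse and rotate_right are illustrative names):

#include <array>
#include <cstddef>
#include <iostream>

template <std::size_t N> std::array<int, N> reverse(const std::array<int, N> &v)
{
  std::array<int, N> r{};
  for (std::size_t i = 0; i < N; ++i)
    r[i] = v[N - 1 - i]; // REVn: lane i takes lane N-1-i
  return r;
}

template <std::size_t N> std::array<int, N> rotate_right(const std::array<int, N> &v, std::size_t n)
{
  std::array<int, N> r{};
  for (std::size_t i = 0; i < N; ++i)
    r[(i + n) % N] = v[i]; // ROTs_n: every lane moves n positions to the right
  return r;
}

int main()
{
  const std::array<int, 4> x{0, 1, 2, 3};
  const auto rev = reverse(x);         // {3, 2, 1, 0}, like REV4(x)
  const auto rot = rotate_right(x, 1); // {3, 0, 1, 2}, like ROT4_1(x)
  std::cout << rev[0] << ' ' << rot[0] << '\n'; // prints: 3 3
}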
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
index a83b1a8a5..5f1b3f902 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
@@ -15,7 +15,7 @@
*/
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,29 +37,112 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-
#ifndef ARM_COMPUTE_HELPERS_ASYMM_H
#define ARM_COMPUTE_HELPERS_ASYMM_H
#include "helpers.h"
+/** Convert the given vector with round to nearest even rounding mode
+ *
+ * @param[in] x The target to be converted
+ * @param[in] type The target type
+ *
+ * @return The converted vector
+ */
+#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type)
+
+/** Quantize a floating-point scalar value to 8-bit asymmetric
+ *
+ * @param[in] input Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale Quantization scale
+ *
+ * @return quantized value
+ */
+inline uchar quantize_qasymm8(float input, float offset, float scale)
+{
+ float out_f32 = input / scale + offset;
+ uchar res_u8 = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, int), uchar);
+ return res_u8;
+}
+
+/** Dequantize a scalar value from 8-bit asymmetric to floating-point
+ *
+ * @param[in] input Input value to dequantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale Quantization scale
+ *
+ * @return dequantized value
+ */
+inline float dequantize_qasymm8(uchar input, float offset, float scale)
+{
+ return ((float)input - offset) * scale;
+}
+
+/** Dequantize a scalar value from signed 8-bit asymmetric to floating-point
+ *
+ * @param[in] input Input value to dequantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale Quantization scale
+ *
+ * @return dequantized value
+ */
+inline float dequantize_qasymm8_signed(char input, float offset, float scale)
+{
+ return ((float)input - offset) * scale;
+}
+
+/** Quantize a vector of values from floating-point
+ *
+ * @param[in] type Output data type.
+ * @param[in] size Size of vector.
+ *
+ * @return quantized values
+ */
+#define QUANTIZE_IMPL(type, size) \
+ inline VEC_DATA_TYPE(type, size) \
+ quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
+ { \
+ VEC_DATA_TYPE(float, size) \
+ out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \
+ VEC_DATA_TYPE(type, size) \
+ res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), \
+ VEC_DATA_TYPE(type, size)); \
+ return res; \
+ }
+
+/** Dequantize a vector of values to floating-point
+ *
+ * @param[in] type Input data type.
+ * @param[in] size Size of vector.
+ *
+ * @return dequantized values in floating point
+ */
+#define DEQUANTIZE_IMPL(type, size) \
+ inline VEC_DATA_TYPE(float, size) \
+ dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
+ { \
+ return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \
+ }
+
/** Correctly-rounded-to-nearest division by a power-of-two.
*
* @param[in] size Size of vector.
*
* @return Correctly-rounded-to-nearest division by a power-of-two.
*/
-#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) \
- asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, int exponent) \
- { \
- VEC_DATA_TYPE(int, size) \
- mask = (1 << exponent) - 1; \
- const VEC_DATA_TYPE(int, size) zero = 0; \
- const VEC_DATA_TYPE(int, size) one = 1; \
- VEC_DATA_TYPE(int, size) \
- threshold = (mask >> 1) + select(zero, one, x < 0); \
- return (x >> exponent) + select(zero, one, (x & mask) > threshold); \
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size( \
+ VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
+ { \
+ const VEC_DATA_TYPE(int, size) zero = (VEC_DATA_TYPE(int, size))0; \
+ const VEC_DATA_TYPE(int, size) one = (VEC_DATA_TYPE(int, size))1; \
+ VEC_DATA_TYPE(int, size) \
+ mask = (one << exponent) - one; \
+ VEC_DATA_TYPE(int, size) \
+ threshold = (mask >> 1) + select(zero, one, x < 0); \
+ return (x >> exponent) + select(zero, one, (x & mask) > threshold); \
}
/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
@@ -81,9 +164,19 @@
b_64 = convert_long##size(b); \
VEC_DATA_TYPE(long, size) \
ab_64 = a_64 * b_64; \
- /* COMPMID-907 */ \
+ /* Revert COMPMID-907 */ \
+ VEC_DATA_TYPE(long, size) \
+ mask1 = 1 << 30; \
+ VEC_DATA_TYPE(long, size) \
+ mask2 = 1 - (1 << 30); \
+ VEC_DATA_TYPE(long, size) \
+ is_positive_or_zero = ab_64 >= 0; \
+ VEC_DATA_TYPE(long, size) \
+ nudge = select(mask2, mask1, is_positive_or_zero); \
+ VEC_DATA_TYPE(long, size) \
+ mask = 1ll << 31; \
VEC_DATA_TYPE(int, size) \
- ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31)); \
+ ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask); \
return select(ab_x2_high32, INT_MAX, overflow); \
}
@@ -335,9 +428,18 @@
return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \
}
+#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
+#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
+#define DEQUANTIZE_STR(input, offset, scale, type, size) \
+ dequantize_##type##size(input, offset, scale)
+#define DEQUANTIZE(input, offset, scale, type, size) \
+ DEQUANTIZE_STR(input, offset, scale, type, size)
+
#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) \
asymm_rounding_divide_by_POW2_##size(x, exponent)
#define ASYMM_MULT(a, b, size) asymm_mult##size(a, b)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
+ ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size)
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) \
@@ -360,11 +462,53 @@
#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \
asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
+#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \
+ { \
+ const int left_shift = shift > 0 ? shift : 0; \
+ const int right_shift = shift > 0 ? 0 : -shift; \
+ return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), \
+ right_shift, size); \
+ }
+#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) \
+ multiply_by_quantized_multiplier##size(input, qmul, shift)
+
+QUANTIZE_IMPL(uchar, 1)
+QUANTIZE_IMPL(char, 1)
+QUANTIZE_IMPL(uint, 1)
+QUANTIZE_IMPL(int, 1)
+QUANTIZE_IMPL(uchar, 4)
+QUANTIZE_IMPL(ushort, 4)
+QUANTIZE_IMPL(short, 4)
+QUANTIZE_IMPL(uchar, 16)
+QUANTIZE_IMPL(char, 16)
+QUANTIZE_IMPL(ushort, 16)
+QUANTIZE_IMPL(short, 16)
+QUANTIZE_IMPL(uint, 16)
+QUANTIZE_IMPL(int, 16)
+
+DEQUANTIZE_IMPL(uchar, 1)
+DEQUANTIZE_IMPL(char, 1)
+DEQUANTIZE_IMPL(uint, 1)
+DEQUANTIZE_IMPL(int, 1)
+DEQUANTIZE_IMPL(uchar, 4)
+DEQUANTIZE_IMPL(ushort, 4)
+DEQUANTIZE_IMPL(short, 4)
+DEQUANTIZE_IMPL(uchar, 16)
+DEQUANTIZE_IMPL(char, 16)
+DEQUANTIZE_IMPL(ushort, 16)
+DEQUANTIZE_IMPL(short, 16)
+DEQUANTIZE_IMPL(uint, 16)
+DEQUANTIZE_IMPL(int, 16)
+
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)
+ASYMM_MULT_IMPL(1)
ASYMM_MULT_IMPL(2)
ASYMM_MULT_IMPL(4)
ASYMM_MULT_IMPL(8)
@@ -375,16 +519,19 @@ ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16)
+ASYMM_SELECT_USING_MASK_IMPL(1)
ASYMM_SELECT_USING_MASK_IMPL(2)
ASYMM_SELECT_USING_MASK_IMPL(4)
ASYMM_SELECT_USING_MASK_IMPL(8)
ASYMM_SELECT_USING_MASK_IMPL(16)
+ASYMM_MASK_IF_ZERO_IMPL(1)
ASYMM_MASK_IF_ZERO_IMPL(2)
ASYMM_MASK_IF_ZERO_IMPL(4)
ASYMM_MASK_IF_ZERO_IMPL(8)
ASYMM_MASK_IF_ZERO_IMPL(16)
+ASYMM_MASK_IF_NON_ZERO_IMPL(1)
ASYMM_MASK_IF_NON_ZERO_IMPL(2)
ASYMM_MASK_IF_NON_ZERO_IMPL(4)
ASYMM_MASK_IF_NON_ZERO_IMPL(8)
@@ -400,6 +547,7 @@ ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4)
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8)
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(1)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8)
@@ -415,9 +563,16 @@ ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4)
ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8)
ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16)
+ASYMM_RESCALE_IMPL(1)
ASYMM_RESCALE_IMPL(2)
ASYMM_RESCALE_IMPL(4)
ASYMM_RESCALE_IMPL(8)
ASYMM_RESCALE_IMPL(16)
+MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(1)
+MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(2)
+MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(4)
+MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(8)
+MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(16)
+
#endif // ARM_COMPUTE_HELPERS_ASYMM_H
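[Editor's note] To make the macro layer above concrete: each QUANTIZE_IMPL/DEQUANTIZE_IMPL instantiation token-pastes a type and a vector size into a function name, so DEQUANTIZE(in, offset, scale, uchar, 4) resolves to dequantize_uchar4(in, offset, scale). The scalar C++ sketch below mirrors what the size-1 helpers compute under the usual QASYMM8 affine convention; all names are illustrative rather than part of the patch, and the tie-breaking of the rounding right shift is simplified relative to ASYMM_ROUNDING_DIVIDE_BY_POW2.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Scalar sketch of the generated quantize_uchar1 / dequantize_uchar1 helpers,
// assuming q = round(x / scale) + offset with saturation to [0, 255].
inline uint8_t quantize_uchar(float x, int offset, float scale)
{
  const int q = static_cast<int>(std::lround(x / scale)) + offset;
  return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

inline float dequantize_uchar(uint8_t q, int offset, float scale)
{
  return (static_cast<int>(q) - offset) * scale;
}

// Scalar sketch of multiply_by_quantized_multiplier##size: a positive shift
// becomes a pre-multiplication left shift, a negative one becomes a rounding
// right shift applied after the fixed-point doubling-high multiply.
inline int32_t multiply_by_quantized_multiplier(int32_t input, int32_t qmul, int shift)
{
  const int left_shift = shift > 0 ? shift : 0;
  const int right_shift = shift > 0 ? 0 : -shift;
  const int64_t prod = static_cast<int64_t>(input) * (INT64_C(1) << left_shift) * qmul;
  const int32_t mul = static_cast<int32_t>((prod + (INT64_C(1) << 30)) >> 31);
  if (right_shift == 0)
    return mul;
  const int32_t round = 1 << (right_shift - 1); // simplified nearest rounding
  return (mul + round) >> right_shift;
}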
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl
deleted file mode 100644
index 12c8eeb79..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-
-#ifndef VEC_SIZE
-#define VEC_SIZE 1
-#endif
-
-#if defined(DATA_TYPE)
-/** Returns the result of the PReLU function, defined as:
- * f(input) = alpha * input for input < 0, f(input) = input for input >= 0.
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
- * -DVEC_SIZE=16
- * @note Can only take floating point data types.
- *
- * @param[in] input1_ptr Pointer to the source image. Supported Data
- * types : F16/F32
- * @param[in] input1_stride_x Stride of the source image in X dimension (in
- * bytes)
- * @param[in] input1_step_x input1_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] input1_stride_y Stride of the source image in Y dimension (in
- * bytes)
- * @param[in] input1_step_y input1_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] input1_step_z input1_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source
- * image
- * @param[in] alpha_ptr Pointer to the source image. Supported Data
- * types : F16/F32
- * @param[in] alpha_stride_x Stride of the source image in X dimension (in
- * bytes)
- * @param[in]  alpha_step_x                           alpha_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] alpha_stride_y Stride of the source image in Y dimension (in
- * bytes)
- * @param[in]  alpha_step_y                           alpha_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in]  alpha_step_z                           alpha_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source
- * image
- *
- * @param[out] output_ptr Pointer to the destination image. Supported
- * data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension
- * (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension
- * (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
- * destination image
- */
-__kernel void prelu(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(alpha),
- TENSOR3D_DECLARATION(output))
-{
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- VSTORE(VEC_SIZE)
- (VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) < 0
- ? VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) *
- VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)alpha.ptr)
- : VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr),
- 0, (__global DATA_TYPE *)output.ptr);
-}
-#endif // defined(DATA_TYPE)
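[Editor's note] The deleted prelu kernel above reduces, element-wise, to the PReLU definition in its doc comment. A minimal scalar C++ reference, assuming flat buffers and a per-element alpha (both layout assumptions are illustrative, not part of the patch):

#include <cstddef>

// Scalar reference for the deleted prelu kernel:
// f(x) = x          for x >= 0
// f(x) = alpha * x  for x <  0
void prelu_reference(const float *input, const float *alpha, float *output, std::size_t n)
{
  for (std::size_t i = 0; i < n; ++i)
  {
    output[i] = input[i] < 0.0f ? input[i] * alpha[i] : input[i];
  }
}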
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl
deleted file mode 100644
index a66e107d1..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-#define SUB(x, y) ((x) - (y))
-
-#if defined(OFF_IN) && defined(OFF_ALPHA) && defined(OFF_OUT) && defined(SCALE_IN) && \
- defined(SCALE_ALPHA) && defined(SCALE_OUT) && defined(VEC_SIZE)
-
-#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
-#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
-#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE)
-#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
-#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
-#define SELECT_TYPE VEC_INT
-
-/** Returns the result of the PReLU function, defined as:
- * f(input) = alpha * input for input < 0, f(input) = input for input >= 0.
- *
- * @attention Data type can be passed using the -DDATA_TYPE_IN compile flag, e.g.
- * -DDATA_TYPE_IN=uchar
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
- * -DVEC_SIZE=16
- * @note Can only take uchar data types.
- *
- * @param[in] input1_ptr Pointer to the source image. Supported Data
- * types : QASYMM8
- * @param[in] input1_stride_x Stride of the source image in X dimension (in
- * bytes)
- * @param[in] input1_step_x input1_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] input1_stride_y Stride of the source image in Y dimension (in
- * bytes)
- * @param[in] input1_step_y input1_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] input1_step_z input1_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source
- * image
- * @param[in] alpha_ptr Pointer to the source image. Supported Data
- * types : QASYMM8
- * @param[in] alpha_stride_x Stride of the source image in X dimension (in
- * bytes)
- * @param[in]  alpha_step_x                           alpha_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] alpha_stride_y Stride of the source image in Y dimension (in
- * bytes)
- * @param[in]  alpha_step_y                           alpha_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in]  alpha_step_z                           alpha_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source
- * image
- * @param[out] output_ptr Pointer to the destination image. Supported
- * data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension
- * (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension
- * (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
- * destination image
- */
-__kernel void prelu_qasymm8(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(alpha),
- TENSOR3D_DECLARATION(output))
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- VEC_INT in_vec = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)input.ptr), VEC_INT);
- VEC_INT alpha_vec = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)alpha.ptr), VEC_INT);
-
- in_vec = SUB(in_vec, (VEC_INT)((int)OFF_IN));
- alpha_vec = SUB(alpha_vec, (VEC_INT)((int)OFF_ALPHA));
-
- const VEC_FLOAT inf32 = CONVERT(in_vec, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN);
- const VEC_FLOAT alphaf32 = CONVERT(alpha_vec, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_ALPHA);
- const VEC_FLOAT outf32 =
- select(inf32, inf32 * alphaf32, CONVERT(inf32 < (VEC_FLOAT)0, SELECT_TYPE));
- const VEC_FLOAT qresf32 = outf32 / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFF_OUT));
- const VEC_UCHAR res = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_UCHAR);
-
- VSTORE(VEC_SIZE)
- (res, 0, (__global uchar *)output.ptr);
-}
-
-#endif // defined(OFF_IN) && defined(OFF_ALPHA) && defined(OFF_OUT) && defined(SCALE_IN) &&
- // defined(SCALE_ALPHA) && defined(SCALE_OUT) && defined(VEC_SIZE)
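[Editor's note] The quantized variant wraps the same select around a dequantize/requantize round trip. A scalar C++ sketch of that pipeline, with OFF_*/SCALE_* passed as ordinary parameters and round-to-nearest standing in for the kernel's CONVERT_RTE (a simplification; exact tie behavior may differ):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Scalar reference for the deleted prelu_qasymm8 kernel: dequantize both
// operands, apply PReLU in float, then requantize with the output params.
uint8_t prelu_qasymm8_reference(uint8_t in, uint8_t alpha,
                                int off_in, float scale_in,
                                int off_alpha, float scale_alpha,
                                int off_out, float scale_out)
{
  const float inf32 = (static_cast<int>(in) - off_in) * scale_in;
  const float alphaf32 = (static_cast<int>(alpha) - off_alpha) * scale_alpha;
  const float outf32 = inf32 < 0.0f ? inf32 * alphaf32 : inf32;
  const int q = static_cast<int>(std::lround(outf32 / scale_out)) + off_out;
  return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

Note that saturation to [0, 255] happens only after requantization, matching the kernel's CONVERT_SAT on the final store.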
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl
deleted file mode 100644
index eb612f834..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN)
-/** Performs space-to-depth rearrangement of a tensor
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size.
- * e.g. -DDEPTH_IN=16
- * @attention The value of the z-axis of input tensor depth should be given as a preprocessor
- * argument using -DZ_IN=size. e.g. -DZ_IN=16
- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g.
- * -DBLOCK_SIZE=1
- *
- * @param[in] input_ptr Pointer to the source image. Supported data
- * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in
- * bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in
- * bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
- * image
- * @param[out] output_ptr Pointer to the destination image. Supported data
- * types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension
- * (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension
- * (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the source tensor in W dimension (in
- * bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W
- * processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
- * destination image
- */
-__kernel void space_to_depth_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
-
- int out_index[4] = {0};
- int in_index[4] = {0};
-
- in_index[0] = get_global_id(0); // W
- in_index[1] = get_global_id(1); // H
- in_index[2] = get_global_id(2) % Z_IN; // C
- in_index[3] = get_global_id(2) / Z_IN; // B
-
- out_index[0] = in_index[0] / BLOCK_SIZE;
- out_index[1] = in_index[1] / BLOCK_SIZE;
- out_index[2] =
- in_index[2] + ((in_index[1] % BLOCK_SIZE) * BLOCK_SIZE + in_index[0] % BLOCK_SIZE) * DEPTH_IN;
- out_index[3] = in_index[3];
-
- *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2],
- out_index[3])) = *((__global DATA_TYPE *)in.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN)
-
-#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN)
-/** Performs space-to-depth rearrangement of a tensor
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size.
- * e.g. -DDEPTH_IN=16
- * @attention The value of the z-axis of input tensor depth should be given as a preprocessor
- * argument using -DZ_IN=size. e.g. -DZ_IN=16
- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g.
- * -DBLOCK_SIZE=1
- *
- * @param[in] input_ptr Pointer to the source image. Supported data
- * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in
- * bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in
- * bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
- * image
- * @param[out] output_ptr Pointer to the destination image. Supported data
- * types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension
- * (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension
- * (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the source tensor in W dimension (in
- * bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W
- * processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
- * destination image
- */
-__kernel void space_to_depth_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
-
- int out_index[4] = {0};
- int in_index[4] = {0};
-
- in_index[0] = get_global_id(0); // C
- in_index[1] = get_global_id(1); // W
- in_index[2] = get_global_id(2) % Z_IN; // H
- in_index[3] = get_global_id(2) / Z_IN; // B
-
- out_index[0] =
- in_index[0] + ((in_index[2] % BLOCK_SIZE) * BLOCK_SIZE + in_index[1] % BLOCK_SIZE) * DEPTH_IN;
- out_index[1] = in_index[1] / BLOCK_SIZE;
- out_index[2] = in_index[2] / BLOCK_SIZE;
- out_index[3] = in_index[3];
-
- *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2],
- out_index[3])) = *((__global DATA_TYPE *)in.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN)
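[Editor's note] The index arithmetic in space_to_depth_nchw is easier to read pulled out of the kernel. A scalar C++ sketch of the mapping from an input coordinate to its output coordinate (struct and function names are illustrative):

// Maps an NCHW input element (w, h, c, b) to its space-to-depth output
// coordinate: spatial dims shrink by block_size, and the intra-block offset
// is folded into the channel index in steps of depth_in.
struct Coord4
{
  int x, y, z, w;
};

Coord4 space_to_depth_nchw_map(int w, int h, int c, int b, int block_size, int depth_in)
{
  Coord4 out;
  out.x = w / block_size;
  out.y = h / block_size;
  out.z = c + ((h % block_size) * block_size + w % block_size) * depth_in;
  out.w = b; // batch is untouched
  return out;
}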
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp
deleted file mode 100644
index 06eeb5b98..000000000
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis)
-{
- TensorShape out_shape{input_shape};
-
- out_shape.set(axis, 1);
-
- return out_shape;
-}
-} // namespace
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis,
- ArgOperation /*op*/)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::S32, DataType::F32, DataType::U8,
- DataType::QASYMM8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::S32);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape().num_dimensions() - 1) !=
- output->tensor_shape().num_dimensions(),
-                                  "Output's rank must be one less than input's rank");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
-                                  "Output's total size must not be zero");
-
- const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(),
-                                  "Output's total size must match the size inferred from axis");
-
- const auto num_dimensions = input->tensor_shape().num_dimensions();
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= num_dimensions, "axis must be less than input's rank.");
- return Status{};
-}
-
-} // namespace
-
-CLArgOperationKernel::CLArgOperationKernel() : _input(nullptr), _output(nullptr), _axis() {}
-
-void CLArgOperationKernel::configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis,
- ArgOperation op)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
-
- _input = input;
- _output = output;
- _axis = axis;
-
- std::unique_ptr<ITensorInfo> output_info = output->info()->clone();
- output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis));
-
-  // Construct the kernel name and select the op_code matching the requested ArgOperation
- std::string kernel_name = "arg_op";
- int op_code = 0;
- if (op == ArgOperation::MAX)
- {
- op_code = 1;
- }
- else if (op == ArgOperation::MIN)
- {
- op_code = 2;
- }
- else
-    throw std::runtime_error("Operation not supported yet");
-
- // Set kernel build options
- std::set<std::string> build_opts;
- build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2)));
- build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code));
-
- // Create kernel
- _kernel =
- static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
- // Configure kernel window
- Window win = calculate_max_window(*output_info, Steps());
-
- Coordinates coord;
- coord.set_num_dimensions(output_info->num_dimensions());
- output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape()));
-
- ICLKernel::configure_internal(win);
-}
-
-Status CLArgOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
- const uint32_t axis, ArgOperation op)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
-
- return Status{};
-}
-
-void CLArgOperationKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const TensorShape &shape_in = _input->info()->tensor_shape();
-
- unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
-
- _kernel.setArg<cl_int>(idx++, _axis);
- _kernel.setArg<cl_int>(idx++, shape_in[_axis]);
-
- Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
- // Setup input slice
- Window slice_in(slice_out);
- slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
- slice_in.set(3, Window::Dimension(0, 0, 0));
-
-  // Save the output tensor's shape so that it can be restored at the end of this method
- const TensorShape shape_out = _output->info()->tensor_shape();
- _output->info()->set_tensor_shape(inferOutputShape(shape_in, _axis));
-
- do
- {
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, slice_in);
- add_4D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice_out);
- } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
-
-  // Restore the output tensor's original shape
- _output->info()->set_tensor_shape(shape_out);
-}
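[Editor's note] For reference, the shape rule that inferOutputShape encodes: an argmin/argmax over `axis` leaves every other dimension alone and keeps a size-1 placeholder at `axis`, which run() temporarily re-applies to the output before enqueuing. A stand-alone sketch (hypothetical helper, not ACL API):

#include <cstddef>
#include <vector>

// e.g. infer_arg_op_shape({3, 4, 5}, 1) == {3, 1, 5}
std::vector<std::size_t> infer_arg_op_shape(std::vector<std::size_t> shape, std::size_t axis)
{
  shape[axis] = 1; // the reduced dimension keeps a size-1 placeholder
  return shape;
}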
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
index bb5556888..fbc76f5e1 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
@@ -43,6 +43,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "support/StringSupport.h"
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
deleted file mode 100644
index 01ea655b4..000000000
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CL/kernels/CLCastKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-CLCastKernel::CLCastKernel() : _input(nullptr), _output(nullptr) {}
-
-void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output, SubDataType input_subtype)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
- DataType::S16, DataType::S32, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
- DataType::S16, DataType::S32, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-
- _input = input;
- _output = output;
-
- constexpr unsigned int num_elems_processed_per_iteration = 16;
-
- // Set kernel build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DDATA_TYPE_OUT=" +
- get_cl_type_from_data_type(output->info()->data_type()));
- build_opts.add_option(
- ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-
- // Create kernel
- if (is_data_type_quantized_asymmetric(input->info()->data_type()))
- {
- UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform();
- const float scale_in = qinfo.scale;
- const int offset_in = qinfo.offset;
- build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_in));
- build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_in));
-
- _kernel = static_cast<cl::Kernel>(
- CLKernelLibraryEx::get().create_kernel("cast_qasymm_in", build_opts.options()));
- }
- else if (is_data_type_quantized_asymmetric(output->info()->data_type()))
- {
- UniformQuantizationInfo qinfo = output->info()->quantization_info().uniform();
- const float scale_in = qinfo.scale;
-    const int offset_in = qinfo.offset;
-
- build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_in));
- build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_in));
-
- _kernel = static_cast<cl::Kernel>(
- CLKernelLibraryEx::get().create_kernel("cast_qasymm_out", build_opts.options()));
- }
- else
- {
- build_opts.add_option_if(input_subtype == SubDataType::BOOL, "-DBOOL_INPUT");
- _kernel = static_cast<cl::Kernel>(
- CLKernelLibraryEx::get().create_kernel("cast", build_opts.options()));
- }
-
- // Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, input->info()->valid_region());
-
- ICLKernel::configure_internal(win);
-}
-
-void CLCastKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_3D();
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- } while (collapsed.slide_window_slice_3D(slice));
-}
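[Editor's note] The two quantized branches in configure() pick kernels whose per-element contract is a plain affine cast. A scalar C++ sketch of both directions, with the rounding mode simplified to round-to-nearest and all names illustrative:

#include <algorithm>
#include <cmath>
#include <cstdint>

// cast_qasymm_in dequantizes: quantized value -> float
float cast_qasymm_in_reference(uint8_t q, int offset, float scale)
{
  return (static_cast<int>(q) - offset) * scale;
}

// cast_qasymm_out requantizes: float -> quantized value, saturated
uint8_t cast_qasymm_out_reference(float x, int offset, float scale)
{
  const int q = static_cast<int>(std::lround(x / scale)) + offset;
  return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}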
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
deleted file mode 100644
index 389136817..000000000
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-// TODO Use this validation function
-#if 0
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
- const int32_t block_size)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
- DataType::S16, DataType::S32, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
- DataType::S16, DataType::S32, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1,
- "Block size should be greater than or equal to 1.");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0) * block_size,
- "Output width should be equal to (Input width * block size)");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(1) != input->dimension(1) * block_size,
- "Output height should be equal to (Input height * block size)");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) % (block_size * block_size) != 0,
- "Input depth should be divisible by (block size * block size)");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- output->dimension(2) != input->dimension(2) / (block_size * block_size),
- "Output depth should be equal to (Input depth / (block size * block size))");
-
- return Status{};
-}
-#endif
-} // namespace
-
-CLDepthToSpaceKernel::CLDepthToSpaceKernel() : _input(nullptr), _output(nullptr)
-{
- // DO NOTHING
-}
-
-void CLDepthToSpaceKernel::configure(const ICLTensor *input, ICLTensor *output,
- const int32_t block_size)
-{
- // TODO Add validation of data_layout
- _input = input;
- _output = output;
-
- // Set kernel build options
- auto layout_out = output->info()->data_layout();
- std::set<std::string> build_opts;
- build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
- auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL);
- auto depth = output->info()->dimension(index_depth);
- build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(depth));
- build_opts.emplace("-DZ_OUT=" + support::cpp11::to_string(output->info()->tensor_shape().z()));
-
- // Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(
- "depth_to_space_" + lower_string(string_from_data_layout(layout_out)), build_opts));
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info(), Steps());
-
- Coordinates coord;
- coord.set_num_dimensions(output->info()->num_dimensions());
- output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-
- ICLKernel::configure_internal(win);
-}
-
-void CLDepthToSpaceKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
- Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
- // Setup input slice
- Window slice_in(slice_out);
- slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
- slice_in.set(3, Window::Dimension(0, 0, 0));
-
- do
- {
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, slice_in);
- add_4D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice_out);
- } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
-}
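[Editor's note] The kernel this configures is the inverse gather of the space_to_depth mapping sketched earlier. Assuming the deleted depth_to_space.cl follows the inverted convention (an assumption, since that file is not shown in this hunk), each NCHW output element reads from:

struct Coord4
{
  int x, y, z, w;
};

// For an NCHW output element (x, y, z, b), the source coordinate packs the
// intra-block offset into the channel index in steps of depth_out, mirroring
// the -DBLOCK_SIZE / -DDEPTH_OUT build options set above (sketch only).
Coord4 depth_to_space_nchw_src(int x, int y, int z, int b, int block_size, int depth_out)
{
  Coord4 in;
  in.x = x / block_size;
  in.y = y / block_size;
  in.z = z + ((y % block_size) * block_size + x % block_size) * depth_out;
  in.w = b;
  return in;
}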
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
index 79f5ce065..67aaf2db6 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
@@ -43,6 +43,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "support/StringSupport.h"
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.cpp
deleted file mode 100644
index 235e8975d..000000000
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.cpp
+++ /dev/null
@@ -1,372 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "support/ToolchainSupport.h"
-
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-using namespace arm_compute;
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-class Coordinates;
-} // namespace arm_compute
-
-namespace
-{
-using ElementsProcessed = Steps;
-
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1,
- const ITensorInfo *output, const GEMMReshapeInfo &gemm_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::S8);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4,
- "The number of dimensions for the matrix A must be <= 4");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3,
- "The number of dimensions for the matrix B must be <= 3");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 2 &&
- gemm_info.reinterpret_input_as_3d(),
- "The input1 tensor cannot have more than 2 dimensions if input0 "
- "has to be reinterpreted as 3D");
-
- const int m = gemm_info.m();
- const int n = gemm_info.n();
- const int k = gemm_info.k();
-
- ARM_COMPUTE_UNUSED(m);
- ARM_COMPUTE_UNUSED(n);
- ARM_COMPUTE_UNUSED(k);
-
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != static_cast<unsigned int>(k));
- ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != static_cast<unsigned int>(n));
- ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(1) != static_cast<unsigned int>(k));
- if (gemm_info.reinterpret_input_as_3d())
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) !=
- static_cast<unsigned int>(m));
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != static_cast<unsigned int>(m));
- }
-
- if (output->total_size() != 0)
- {
- const TensorInfo tensor_info_output =
- output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, false, gemm_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1,
- ITensorInfo *output,
- const GEMMReshapeInfo &gemm_info,
- ElementsProcessed &num_elements_processed)
-{
- unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
- unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
-
- Window win{};
- Window win_out{};
- bool window_changed = false;
-
- // In case both input and output have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if (reinterpret_input_as_3d == reinterpret_output_as_3d)
- {
- reinterpret_input_as_3d = false;
- reinterpret_output_as_3d = false;
- }
-
-  // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output,
- input0->clone()
- ->set_tensor_shape(compute_mm_shape(*input0, *input1, false, gemm_info))
- .set_data_type(DataType::S32));
-
- TensorInfo tmp_info(*output);
-
- if (reinterpret_output_as_3d)
- {
-    // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a
-    // 2D GEMM, the window needs to be constructed on the 2D collapsed version of the tensor
- TensorShape tmp_shape(output->tensor_shape());
- tmp_shape.collapse(2U, 1U);
- tmp_info.set_tensor_shape(tmp_shape);
- }
-
-  // Special case for 1xN, 2xN, 3xN and 4xN input0 tensors: process 4 output columns per iteration
-  // Note: if the dot product instruction is available, the 8x2 tile has to be used
- num_elems_processed_per_iteration_x = 4;
- num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);
-
-  // Note: bottom paddings are calculated manually as the output can be reinterpreted as a 3D tensor
-  // The only way to set the paddings properly is to set them explicitly through
-  // AccessWindowStatic
- const int m = reinterpret_input_as_3d ? input0->tensor_shape()[1] * input0->tensor_shape()[2]
- : input0->tensor_shape()[1];
- const int bottom_pad =
- (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) %
- num_elems_processed_per_iteration_y;
-
- // Configure window
- win = calculate_max_window(
- tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- win_out = calculate_max_window(
- *output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0),
- input0->dimension(1) + bottom_pad);
- AccessWindowStatic input1_access(
- input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
- input1->dimension(1));
- AccessWindowStatic output_access(
- output, 0, 0, ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
- output->dimension(1) + bottom_pad);
-
- window_changed =
- update_window_and_padding(win, input0_access,
- input1_access) || // window used by the execute_window_loop
- update_window_and_padding(
- win_out,
- output_access); // window used to update the padding requirements of output tensor
-
- Coordinates coord;
- coord.set_num_dimensions(output->num_dimensions());
- output_access.set_valid_region(win_out, ValidRegion(coord, output->tensor_shape()));
-
- // Collapse along the Z direction
- // This collapse needs to be here in order to tune the Z dimension of LWS
- Window collapsed = win;
- const unsigned int dimension_to_collapse =
- std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
- collapsed = win.collapse(win, dimension_to_collapse);
-
- Status err = (window_changed)
- ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
- : Status{};
- return std::make_pair(err, collapsed);
-}
-} // namespace
-
-CLGEMMLowpMatrixMultiplyKernelEx::CLGEMMLowpMatrixMultiplyKernelEx()
- : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true),
- _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false)
-{
-}
-
-void CLGEMMLowpMatrixMultiplyKernelEx::configure(const ICLTensor *input0, const ICLTensor *input1,
- ICLTensor *output,
- const GEMMReshapeInfo &gemm_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-
- ARM_COMPUTE_ERROR_THROW_ON(
- validate_arguments(input0->info(), input1->info(), output->info(), gemm_info));
-
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
-
- // In case both input and output have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if (_reinterpret_input_as_3d == _reinterpret_output_as_3d)
- {
- _reinterpret_input_as_3d = false;
- _reinterpret_output_as_3d = false;
- }
-
- // Check if we need to slide the matrix B
- const unsigned int num_dimensions_input0 = _reinterpret_input_as_3d
- ? _input0->info()->num_dimensions() - 1
- : _input0->info()->num_dimensions();
- _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0);
-
- ElementsProcessed num_elements_processed{};
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(),
- gemm_info, num_elements_processed);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // Create build options
- std::string kernel_name(" ");
- CLBuildOptions build_opts;
- build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
- "-DHEIGHT_GEMM3D=" +
- support::cpp11::to_string(output->info()->dimension(1)));
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
- "-DDEPTH_GEMM3D=" +
- support::cpp11::to_string(output->info()->dimension(2)));
- build_opts.add_option_if(!_slide_matrix_b,
- "-DMATRIX_B_DEPTH=" +
- support::cpp11::to_string(input1->info()->dimension(2)));
- build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0)));
- build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" +
- support::cpp11::to_string(num_elements_processed.x()));
- build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" +
- support::cpp11::to_string(num_elements_processed.y()));
-
- kernel_name = "gemmlowp_mm_midgard_ex";
-
- // Create kernel
- _kernel = static_cast<cl::Kernel>(
- CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options()));
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
- _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
- _config_id += lower_string(string_from_data_type(input0->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
-}
-
-Status CLGEMMLowpMatrixMultiplyKernelEx::validate(const ITensorInfo *input0,
- const ITensorInfo *input1,
- const ITensorInfo *output,
- const GEMMReshapeInfo &gemm_info)
-{
- ElementsProcessed num_elements_processed{};
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, gemm_info));
- ARM_COMPUTE_RETURN_ON_ERROR(
- validate_and_configure_window(input0->clone().get(), input1->clone().get(),
- output->clone().get(), gemm_info, num_elements_processed)
- .first);
-
- return Status{};
-}
-
-void CLGEMMLowpMatrixMultiplyKernelEx::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- if (_input1->info()->num_dimensions() < 3)
- {
- // The stride_z for matrix B must be zero if we do not slice
- ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
- }
-
- Window slice = window.first_slice_window_3D();
- Window slice_matrix_b = slice;
-
- slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
- slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- if (_reinterpret_input_as_3d)
- {
- // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
- const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3;
- const unsigned int total_cross_plane_pad =
- _input0->info()->padding().top + _input0->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- if (_reinterpret_output_as_3d)
- {
- // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
- const unsigned int idx0 =
- 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
- const unsigned int total_cross_plane_pad =
- _output->info()->padding().top + _output->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- do
- {
- Window slice_b = slice;
-    // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A
-    // has more than 2. This scenario can happen when the matrix multiplication is used to
-    // perform a convolution operation
- if (!_slide_matrix_b)
- {
- slice_b = slice_matrix_b;
- }
-
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input0, slice);
- add_2D_tensor_argument(idx, _input1, slice_b);
- add_2D_tensor_argument(idx, _output, slice);
- _kernel.setArg<cl_uint>(idx++,
- static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
- _kernel.setArg<cl_uint>(idx++,
- static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
- _kernel.setArg<cl_uint>(idx++,
- static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
- enqueue(queue, *this, slice, lws_hint());
- } while (window.slide_window_slice_3D(slice));
-}
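[Editor's note] The padding arithmetic in validate_and_configure_window is worth isolating: rows of matrix A are consumed in tiles of up to 4, so the manual bottom pad rounds m up to the next tile boundary, while columns are rounded up with ceil_to_multiple. A scalar C++ sketch (helper names illustrative; ceil_to_multiple mirrors the arm_compute utility used above):

// compute_bottom_pad(5, 4) == 3: a 5-row matrix needs 3 pad rows to fill
// the second 4-row tile.
int compute_bottom_pad(int m, int rows_per_iteration)
{
  return (rows_per_iteration - (m % rows_per_iteration)) % rows_per_iteration;
}

// Round value up to the next multiple of divisor, e.g. ceil_to_multiple(6, 4) == 8.
int ceil_to_multiple(int value, int divisor)
{
  return ((value + divisor - 1) / divisor) * divisor;
}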
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
index 3a25987d0..3bfe3e407 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
@@ -45,6 +45,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
#include "arm_compute/core/UtilsEx.h"
+#include "support/StringSupport.h"
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
index 7fbdcdaa7..930e7c944 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
@@ -43,6 +43,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "support/StringSupport.h"
using namespace arm_compute;
@@ -110,7 +111,7 @@ void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTenso
_hits = hits;
// Make _lookup_indices tensor
- _lookup_indices = arm_compute::support::cpp14::make_unique<CLTensor>();
+ _lookup_indices = support::cpp14::make_unique<CLTensor>();
_lookup_indices->allocator()->init(
TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32));
_lookup_indices->allocator()->allocate();
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp
index b45f6bb24..61c14d271 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp
@@ -48,7 +48,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Window.h"
-
+#include "support/StringSupport.h"
#include "support/ToolchainSupport.h"
namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp
index d305896ea..6b27c9917 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp
@@ -49,6 +49,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "support/StringSupport.h"
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
index 74f7b4158..643c8b110 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
@@ -43,6 +43,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "support/StringSupport.h"
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp
deleted file mode 100644
index 8910a7b80..000000000
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp
+++ /dev/null
@@ -1,210 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CL/kernels/CLPReLUKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate_info(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output)
-{
- const TensorShape &out_shape =
- TensorShape::broadcast_shape(input->tensor_shape(), alpha->tensor_shape());
-
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32,
- DataType::QASYMM8);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(alpha, 1, DataType::F16, DataType::F32,
- DataType::QASYMM8);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
- "Inputs are not broadcast compatible");
- // Validate in case of configured output
- if (output->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32,
- DataType::QASYMM8);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
- "Wrong shape for output");
- }
- return Status{};
-}
-} // namespace
-
-CLPReLUKernel::CLPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {}
-
-void CLPReLUKernel::configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, alpha);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_info(input->info(), alpha->info(), output->info()));
-
- _input = input;
- _alpha = alpha;
- _output = output;
-
- // Create kernel
- std::string kernel_name = "prelu";
- std::set<std::string> build_opts;
- build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
- build_opts.emplace(
- ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-
- if (is_data_type_quantized_asymmetric(input->info()->data_type()))
- {
- build_opts.emplace("-DOFF_IN=" + support::cpp11::to_string(
- input->info()->quantization_info().uniform().offset));
- build_opts.emplace("-DOFF_ALPHA=" + support::cpp11::to_string(
- alpha->info()->quantization_info().uniform().offset));
- build_opts.emplace("-DOFF_OUT=" + support::cpp11::to_string(
- output->info()->quantization_info().uniform().offset));
- build_opts.emplace("-DSCALE_IN=" + support::cpp11::to_string(
- input->info()->quantization_info().uniform().scale));
- build_opts.emplace("-DSCALE_ALPHA=" + support::cpp11::to_string(
- alpha->info()->quantization_info().uniform().scale));
- build_opts.emplace("-DSCALE_OUT=" + support::cpp11::to_string(
- output->info()->quantization_info().uniform().scale));
- kernel_name += "_qasymm8";
- }
- _kernel =
- static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
- const std::pair<TensorShape, ValidRegion> broadcast_pair =
- ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info());
-
- const TensorShape &out_shape = broadcast_pair.first;
- const ValidRegion &valid_region = broadcast_pair.second;
-
- // Auto initialize output if not initialized
- {
- set_shape_if_empty(*output->info(), out_shape);
-
- if (input->info()->data_type() == DataType::F16 && alpha->info()->data_type() == DataType::F16)
- {
- set_format_if_unknown(*output->info(), Format::F16);
- }
- else if (input->info()->data_type() == DataType::F32 ||
- alpha->info()->data_type() == DataType::F32)
- {
- set_format_if_unknown(*output->info(), Format::F32);
- }
- }
-
- Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
- Window win_input1 = win.broadcast_if_dimension_le_one(*input->info());
- Window win_input2 = win.broadcast_if_dimension_le_one(*alpha->info());
-
- AccessWindowHorizontal input1_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal input2_access(alpha->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win_input1, input1_access) ||
- update_window_and_padding(win_input2, input2_access) ||
- update_window_and_padding(win, output_access);
-
- output_access.set_valid_region(win, valid_region);
-
- ICLKernel::configure_internal(win);
-}
-
-void CLPReLUKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const TensorShape &in_shape1 = _input->info()->tensor_shape();
- const TensorShape &in_shape2 = _alpha->info()->tensor_shape();
- const TensorShape &out_shape = _output->info()->tensor_shape();
-
- bool can_collapse = true;
- if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
- {
- can_collapse =
- (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
- for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
- {
- can_collapse = (in_shape1[d] == in_shape2[d]);
- }
- }
-
- bool has_collapsed = false;
- Window collapsed =
- can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
- : window;
-
- const TensorShape &in_shape1_collapsed =
- has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
- const TensorShape &in_shape2_collapsed =
- has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
-
- Window slice = collapsed.first_slice_window_3D();
- Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
- Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice_input1);
- add_3D_tensor_argument(idx, _alpha, slice_input2);
- add_3D_tensor_argument(idx, _output, slice);
-
- enqueue(queue, *this, slice);
-
- collapsed.slide_window_slice_3D(slice_input1);
- collapsed.slide_window_slice_3D(slice_input2);
- } while (collapsed.slide_window_slice_3D(slice));
-}
-
-BorderSize CLPReLUKernel::border_size() const
-{
- const unsigned int replicateSize =
- _output->info()->dimension(0) -
- std::min(_input->info()->dimension(0), _alpha->info()->dimension(0));
- const unsigned int border =
- std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
- return BorderSize(0, border, 0, 0);
-}
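
For reference, the element-wise operation the deleted kernel offloads to the prelu /
prelu_qasymm8 CL programs is out = x for x > 0 and alpha * x otherwise, with alpha
broadcast against the input shape. A minimal scalar sketch, assuming already-broadcast
float buffers (hypothetical helper, not part of the library):

#include <cstddef>

// Scalar PReLU reference: out = x > 0 ? x : alpha * x. The quantized variant
// additionally dequantizes with (scale, offset), applies this, and requantizes.
void prelu_reference(const float *x, const float *alpha, float *out, std::size_t n)
{
  for (std::size_t i = 0; i < n; ++i)
  {
    out[i] = x[i] > 0.f ? x[i] : alpha[i] * x[i];
  }
}
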
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp
index 2d551f654..1a7a18cfa 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp
@@ -49,6 +49,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "support/StringSupport.h"
namespace arm_compute
{
@@ -69,7 +70,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *scale_fac
// Output must always be initialized
ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
return Status{};
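
The S8 to QASYMM8_SIGNED switch in the hunk above matters because both types use int8
storage, but only QASYMM8_SIGNED carries QuantizationInfo on the tensor. A hedged sketch
of the distinction (assumes arm_compute headers; shape and scale values are illustrative):

// Both infos below describe int8 data; only the second knows its scale.
TensorInfo plain_s8(TensorShape(16U), 1, DataType::S8);
TensorInfo quant_s8(TensorShape(16U), 1, DataType::QASYMM8_SIGNED);
quant_s8.set_quantization_info(QuantizationInfo(0.5f)); // scale only: symmetric
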
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
index a98318323..06c2579f2 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
@@ -43,6 +43,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "support/StringSupport.h"
using namespace arm_compute;
namespace
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp
index ff1904abd..8d8853c81 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp
@@ -48,6 +48,7 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "support/StringSupport.h"
#include <climits>
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp
deleted file mode 100644
index 64fc0384e..000000000
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
- const int32_t block_size)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
- DataType::S16, DataType::S32, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
- DataType::S16, DataType::S32, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1,
- "Block size should be greater than or equal to 1.");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(3) != output->dimension(3),
- "Input batch should be equal to Output batch");
-
- auto layout_out = input->data_layout();
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
-
- auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL);
- auto index_height = get_data_layout_dimension_index(layout_out, DataLayoutDimension::HEIGHT);
- auto index_width = get_data_layout_dimension_index(layout_out, DataLayoutDimension::WIDTH);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- input->dimension(index_depth) * block_size * block_size != output->dimension(index_depth),
- "Output depth should be equal to (input depth * block size *block size)");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->dimension(index_width) % block_size) ||
- (input->dimension(index_height) % block_size),
- "Input height and width should be divisible by block size");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- (output->dimension(index_width) != (input->dimension(index_width) / block_size)) ||
- (output->dimension(index_height) != (input->dimension(index_height) / block_size)),
- "Output height and width should be equal to "
- "input_height/blocksize and input_width/blocksize respectively");
-
- return Status{};
-}
-
-} // namespace
-
-CLSpaceToDepthKernel::CLSpaceToDepthKernel() : _input(nullptr), _output(nullptr) {}
-
-void CLSpaceToDepthKernel::configure(const ICLTensor *input, ICLTensor *output,
- const int32_t block_size)
-{
-
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size));
-
- _input = input;
- _output = output;
-
- // Set kernel build options
- auto layout_out = input->info()->data_layout();
- std::set<std::string> build_opts;
- build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
- auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL);
- auto depth = input->info()->dimension(index_depth);
- build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(depth));
- build_opts.emplace("-DZ_IN=" + support::cpp11::to_string(input->info()->tensor_shape().z()));
-
- // Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(
- "space_to_depth_" + lower_string(string_from_data_layout(layout_out)), build_opts));
-
- // Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps());
-
- Coordinates coord;
- coord.set_num_dimensions(output->info()->num_dimensions());
- output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-
- ICLKernel::configure_internal(win);
-}
-
-void CLSpaceToDepthKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
- Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
- // Setup output slice
- Window slice_out(slice_in);
- slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
- slice_out.set(3, Window::Dimension(0, 0, 0));
-
- do
- {
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, slice_in);
- add_4D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice_in);
- } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
-}
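
The shape contract enforced by validate_arguments() above is out_depth = in_depth *
block^2 with out_h = in_h / block and out_w = in_w / block. A minimal scalar sketch over
one contiguous CHW image; the channel-ordering convention here is illustrative, the
authoritative ordering lives in the deleted space_to_depth.cl:

#include <cassert>

// Minimal scalar space-to-depth over one CHW image. Output shape:
// (c * block * block, h / block, w / block); each block x block spatial patch
// becomes block * block extra channels.
void space_to_depth_chw(const float *in, float *out, int c, int h, int w, int block)
{
  assert(h % block == 0 && w % block == 0);
  const int oh = h / block;
  const int ow = w / block;
  for (int ch = 0; ch < c; ++ch)
    for (int y = 0; y < h; ++y)
      for (int x = 0; x < w; ++x)
      {
        const int oc = ch * block * block + (y % block) * block + (x % block);
        out[(oc * oh + y / block) * ow + x / block] = in[(ch * h + y) * w + x];
      }
}
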
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp
deleted file mode 100644
index 61999cbd4..000000000
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp
+++ /dev/null
@@ -1,188 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-using namespace arm_compute;
-
-CLTransposeConvLayerUpsampleKernel::CLTransposeConvLayerUpsampleKernel()
- : _input(nullptr), _output(nullptr), _inner_border(), _info()
-{
-}
-
-Status CLTransposeConvLayerUpsampleKernel::validate(const ITensorInfo *input,
- const ITensorInfo *output,
- const BorderSize &inner_border,
- const PadStrideInfo &info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
-
- const DataLayout data_layout = input->data_layout();
-
- const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-
- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_w) == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_h) == 0);
-
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_c) != output->dimension(idx_c));
- for (size_t i = 3; i < Coordinates::num_max_dimensions; ++i)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i));
- }
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border.right > info.stride().first - 1,
- "inner_border_right must be smaller that stride_x");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border.top > info.stride().second - 1,
- "inner_border_top must be smaller that stride_y");
-
- return Status{};
-}
-
-void CLTransposeConvLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output,
- const BorderSize &inner_border,
- const PadStrideInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- _input = input;
- _output = output;
- _inner_border = inner_border;
- _info = info;
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(CLTransposeConvLayerUpsampleKernel::validate(
- input->info(), output->info(), inner_border, info));
-
- // Create kernel
- CLBuildOptions build_opts;
- build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
- _kernel = static_cast<cl::Kernel>(
- CLKernelLibrary::get().create_kernel("deconvolution_upsample", build_opts.options()));
-
- constexpr unsigned int num_elems_processed_per_iteration = 1;
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
- ICLKernel::configure_internal(win);
-}
-
-void CLTransposeConvLayerUpsampleKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const DataLayout data_layout = _input->info()->data_layout();
-
- const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
- const int out_start_x = _info.pad_left();
- const int out_end_x = _output->info()->dimension(idx_w) - _inner_border.right -
- _info.pad_right() + _info.stride().first - 1;
- const int out_step_x = _info.stride().first;
-
- const int out_start_y = _inner_border.top + _info.pad_top();
- const int out_end_y =
- _output->info()->dimension(idx_h) - _info.pad_bottom() + _info.stride().second - 1;
- const int out_step_y = _info.stride().second;
-
- switch (data_layout)
- {
- case DataLayout::NCHW:
- {
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-
- Window slice_out = collapsed.first_slice_window_3D();
- slice_out.set(Window::DimX, Window::Dimension(out_start_x, out_end_x, out_step_x));
- slice_out.set(Window::DimY, Window::Dimension(out_start_y, out_end_y, out_step_y));
-
- Window slice_in = collapsed.first_slice_window_3D();
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice_in);
- add_3D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice_out);
- } while (collapsed.slide_window_slice_3D(slice_in) &&
- collapsed.slide_window_slice_3D(slice_out));
- break;
- }
- case DataLayout::NHWC:
- {
- // NOTE: not collapsing in NHWC
- Window slice_out = window.first_slice_window_3D();
- slice_out.set(Window::DimY, Window::Dimension(out_start_x, out_end_x, out_step_x));
- slice_out.set(Window::DimZ, Window::Dimension(out_start_y, out_end_y, out_step_y));
-
- Window slice_in = window.first_slice_window_3D();
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice_in);
- add_3D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice_out);
- } while (window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out));
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Unsupported data layout");
- }
-}
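
The strided output windows computed in run() above implement zero-insertion upsampling:
each input pixel is written to one output location, stride elements apart, and everything
in between keeps the fill value, which is the stretching step of a transposed convolution.
A scalar sketch, assuming a single pre-zeroed output channel (hypothetical helper):

// Scalar zero-insertion upsampling: input pixel (x, y) lands at output
// (pad_left + x * stride_x, pad_top + y * stride_y); the gaps keep the fill value.
void upsample_scatter(const float *in, float *out, int in_w, int in_h, int out_w,
                      int stride_x, int stride_y, int pad_left, int pad_top)
{
  for (int y = 0; y < in_h; ++y)
    for (int x = 0; x < in_w; ++x)
      out[(pad_top + y * stride_y) * out_w + (pad_left + x * stride_x)] =
          in[y * in_w + x];
}
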
diff --git a/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp b/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp
deleted file mode 100644
index 648afb304..000000000
--- a/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
-#include <cstddef>
-#include <cstdint>
-
-namespace arm_compute
-{
-CPPUpsampleKernelEx::CPPUpsampleKernelEx() : _input(nullptr), _output(nullptr), _info() {}
-
-bool CPPUpsampleKernelEx::is_parallelisable() const { return false; }
-
-void CPPUpsampleKernelEx::configure(const ITensor *input, ITensor *output,
- const PadStrideInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- _input = input;
- _output = output;
- _info = info;
-
- // Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps());
-
- // The CPPUpsampleKernelEx doesn't need padding so update_window_and_padding() can be skipped
- Coordinates coord;
- coord.set_num_dimensions(output->info()->num_dimensions());
- output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-
- ICPPKernel::configure(win);
-}
-
-void CPPUpsampleKernelEx::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
-
-  // Initialize the output buffer
- const int width_scaled = _output->info()->dimension(0);
- const int height_scaled = _output->info()->dimension(1);
- const int stride_x = _info.stride().first;
- const int stride_y = _info.stride().second;
- const int start_x = _info.pad_left();
- const int start_y = _info.pad_top();
- const int end_y = height_scaled - _info.pad_bottom();
-  const int end_x = width_scaled - _info.pad_right();
- const size_t element_size = _input->info()->element_size();
-
- // The fill value is normally 0, but for QASYMM8 the '0' corresponds to the offset
- const uint8_t fill_value =
- _output->info()->data_type() == DataType::QASYMM8
- ? utility::clamp<uint8_t>(_output->info()->quantization_info().uniform().offset)
- : 0;
-  // Filling a value different from 0 works only for the QASYMM8 data type, since we are
-  // filling 1-byte values in a buffer of uint8_ts
- std::fill_n(_output->buffer(), _output->info()->total_size(), fill_value);
-
- // Create window
- Window window_out(window);
- window_out.set(Window::DimX, Window::Dimension(start_x, end_x, stride_x));
- window_out.set(Window::DimY, Window::Dimension(start_y, end_y, stride_y));
-
- // Create iterators
- Iterator in(_input, window);
- Iterator out(_output, window_out);
-
- execute_window_loop(
- window, [&](const Coordinates &) { memcpy(out.ptr(), in.ptr(), element_size); }, in, out);
-}
-} // namespace arm_compute
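
A note on the fill value used above: with asymmetric quantization real = scale *
(q - offset), so real zero quantizes to the offset itself, which is why QASYMM8 outputs
are pre-filled with the clamped offset rather than literal 0. A tiny sketch of that
mapping (hypothetical helper, not part of the library):

#include <algorithm>
#include <cstdint>

// real = scale * (q - offset), so real zero maps to q == offset; clamp to the
// representable uint8_t range before using it as a fill value.
uint8_t zero_fill_value(bool is_qasymm8, int32_t offset)
{
  return is_qasymm8
             ? static_cast<uint8_t>(std::min<int32_t>(std::max<int32_t>(offset, 0), 255))
             : 0;
}
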
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp
deleted file mode 100644
index fbb9dbca9..000000000
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp
+++ /dev/null
@@ -1,671 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/NECastKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CPP/Validate.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
- SubDataType input_subtype)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8,
- DataType::QASYMM8, DataType::U32,
- DataType::S32, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(input_subtype == SubDataType::BOOL &&
- input->data_type() != DataType::U8);
-
- if (output->tensor_shape().total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8,
- DataType::QASYMM8, DataType::U32,
- DataType::S32, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- }
-
- return Status{};
-}
-
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
- // Configure kernel window
- Window win = calculate_max_window(*input, Steps());
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32);
-
- // NECastKernel doesn't need padding so update_window_and_padding() can be skipped
- Coordinates coord;
- coord.set_num_dimensions(output->num_dimensions());
- output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
-
- return std::make_tuple(Status{}, win);
-}
-
-typedef struct bool8x16
-{
- uint8x16_t val;
-} bool8x16_t;
-
-static inline uint8x16_t vreinterpretq_u8_b8(bool8x16_t __a) { return (uint8x16_t)__a.val; }
-
-template <typename ToV, typename FromV> inline ToV vcast(const FromV &v) { return v; }
-template <> inline uint8x16_t vcast(const bool8x16_t &v)
-{
- const uint8x16_t vu8 = vreinterpretq_u8_b8(v);
- const uint8x16_t zero_uint8x16 = vdupq_n_u8(0);
- uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16);
- return vshrq_n_u8(mask, 7); // true -> 1, false -> 0
-}
-
-template <> inline uint32x4x4_t vcast(const bool8x16_t &v)
-{
- const uint8x16_t vu8 = vreinterpretq_u8_b8(v);
- const uint8x16_t zero_uint8x16 = vdupq_n_u8(0);
- uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16);
- uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0
-
- const uint32x4x4_t ret = {{
- vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb)))),
- vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb)))),
- vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb)))),
- vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb)))),
- }};
-
- return ret;
-}
-
-template <> inline int32x4x4_t vcast(const bool8x16_t &v)
-{
- const uint8x16_t vu8 = vreinterpretq_u8_b8(v);
- const uint8x16_t zero_uint8x16 = vdupq_n_u8(0);
- uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16);
- uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0
-
- const int32x4x4_t ret = {{
- vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb))))),
- vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb))))),
- vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb))))),
- vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb))))),
- }};
-
- return ret;
-}
-
-template <> inline float32x4x4_t vcast(const bool8x16_t &v)
-{
- const uint8x16_t vu8 = vreinterpretq_u8_b8(v);
- const uint8x16_t zero_uint8x16 = vdupq_n_u8(0);
- uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16);
- uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0
-
- const float32x4x4_t ret = {{
- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb))))),
- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb))))),
- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb))))),
- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb))))),
- }};
-
- return ret;
-}
-
-template <> inline uint32x4x4_t vcast(const uint8x16_t &v)
-{
- const uint32x4x4_t ret = {{
- vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v)))),
- vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v)))),
- vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v)))),
- vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v)))),
- }};
-
- return ret;
-}
-
-template <> inline int32x4x4_t vcast(const uint8x16_t &v)
-{
- const int32x4x4_t ret = {{
- vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v))))),
- vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v))))),
- vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v))))),
- vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v))))),
- }};
-
- return ret;
-}
-
-template <> inline float32x4x4_t vcast(const uint8x16_t &v)
-{
- const float32x4x4_t ret = {{
- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v))))),
- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v))))),
- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v))))),
- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v))))),
- }};
-
- return ret;
-}
-
-template <> inline uint8x16_t vcast(const int32x4x4_t &v)
-{
- // Saturate cast
- return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovun_s32(v.val[0]), vqmovun_s32(v.val[1]))),
- vqmovn_u16(vcombine_u16(vqmovun_s32(v.val[2]), vqmovun_s32(v.val[3]))));
-}
-
-template <> inline uint32x4x4_t vcast(const int32x4x4_t &v)
-{
- // Saturate cast
- const uint32x4x4_t ret = {{
- vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[0]))),
- vqmovun_s64(vmovl_s32(vget_high_s32(v.val[0])))),
- vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[1]))),
- vqmovun_s64(vmovl_s32(vget_high_s32(v.val[1])))),
- vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[2]))),
- vqmovun_s64(vmovl_s32(vget_high_s32(v.val[2])))),
- vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[3]))),
- vqmovun_s64(vmovl_s32(vget_high_s32(v.val[3])))),
- }};
-
- return ret;
-}
-
-template <> inline float32x4x4_t vcast(const int32x4x4_t &v)
-{
- const float32x4x4_t ret = {{
- vcvtq_f32_s32(v.val[0]), vcvtq_f32_s32(v.val[1]), vcvtq_f32_s32(v.val[2]),
- vcvtq_f32_s32(v.val[3]),
- }};
-
- return ret;
-}
-
-template <> inline uint8x16_t vcast(const uint32x4x4_t &v)
-{
- return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovn_u32(v.val[0]), vqmovn_u32(v.val[1]))),
- vqmovn_u16(vcombine_u16(vqmovn_u32(v.val[2]), vqmovn_u32(v.val[3]))));
-}
-
-template <> inline int32x4x4_t vcast(const uint32x4x4_t &v)
-{
- const int32x4x4_t ret = {{
- vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[0])))),
- vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[0]))))),
- vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[1])))),
- vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[1]))))),
- vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[2])))),
- vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[2]))))),
- vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[3])))),
- vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[3]))))),
- }};
-
- return ret;
-}
-
-template <> inline float32x4x4_t vcast(const uint32x4x4_t &v)
-{
- const float32x4x4_t ret = {{
- vcvtq_f32_u32(v.val[0]), vcvtq_f32_u32(v.val[1]), vcvtq_f32_u32(v.val[2]),
- vcvtq_f32_u32(v.val[3]),
- }};
-
- return ret;
-}
-
-template <> inline uint8x16_t vcast(const float32x4x4_t &v)
-{
- // Saturate cast
- return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(v.val[0])),
- vqmovun_s32(vcvtq_s32_f32(v.val[1])))),
- vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(v.val[2])),
- vqmovun_s32(vcvtq_s32_f32(v.val[3])))));
-}
-
-template <> inline uint32x4x4_t vcast(const float32x4x4_t &v)
-{
- const uint32x4x4_t ret = {{
- vcvtq_u32_f32(v.val[0]), vcvtq_u32_f32(v.val[1]), vcvtq_u32_f32(v.val[2]),
- vcvtq_u32_f32(v.val[3]),
- }};
-
- return ret;
-}
-
-template <> inline int32x4x4_t vcast(const float32x4x4_t &v)
-{
- const int32x4x4_t ret = {{
- vcvtq_s32_f32(v.val[0]), vcvtq_s32_f32(v.val[1]), vcvtq_s32_f32(v.val[2]),
- vcvtq_s32_f32(v.val[3]),
- }};
-
- return ret;
-}
-
-template <typename T> struct cast_vector;
-template <> struct cast_vector<bool>
-{
- using type = bool8x16_t;
-};
-template <> struct cast_vector<uint8_t>
-{
- using type = uint8x16_t;
-};
-template <> struct cast_vector<uint32_t>
-{
- using type = uint32x4x4_t;
-};
-template <> struct cast_vector<int32_t>
-{
- using type = int32x4x4_t;
-};
-template <> struct cast_vector<float>
-{
- using type = float32x4x4_t;
-};
-
-template <typename T> inline void store_result(T *ptr, const typename cast_vector<T>::type &v)
-{
- wrapper::vstore(ptr, v.val[0]);
- wrapper::vstore(ptr + 4, v.val[1]);
- wrapper::vstore(ptr + 8, v.val[2]);
- wrapper::vstore(ptr + 12, v.val[3]);
-}
-
-template <> inline void store_result<uint8_t>(uint8_t *ptr, const uint8x16_t &v)
-{
- wrapper::vstore(ptr, v);
-}
-
-inline bool8x16_t vloadq(const bool *ptr)
-{
- bool8x16_t ret;
- ret.val = wrapper::vloadq(reinterpret_cast<const uint8_t *>(ptr));
- return ret;
-}
-
-template <typename T> inline typename cast_vector<T>::type load_input(const T *ptr)
-{
- return wrapper::vloadq(ptr);
-}
-
-template <> inline typename cast_vector<bool>::type load_input(const bool *ptr)
-{
- return vloadq(ptr);
-}
-
-template <> inline typename cast_vector<uint32_t>::type load_input(const uint32_t *ptr)
-{
- return vld4q_u32(ptr);
-}
-
-template <> inline typename cast_vector<int32_t>::type load_input(const int32_t *ptr)
-{
- return vld4q_s32(ptr);
-}
-
-template <> inline typename cast_vector<float>::type load_input(const float *ptr)
-{
- return vld4q_f32(ptr);
-}
-
-template <typename T> inline T get_value(const T *ptr) { return *ptr; }
-
-template <> inline bool get_value(const bool *ptr)
-{
- bool ret = (*ptr != 0);
- return ret;
-}
-
-template <typename FromT> void run_cast(const ITensor *input, ITensor *output, const Window &window)
-{
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- // Collapse window and reset first dimension to handle tail calculations manually
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- // Create iterators
- Iterator in(input, win_collapsed);
- Iterator out(output, win_collapsed);
-
-#ifdef __aarch64__
- constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
-#else //__aarch64__
- constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO;
-#endif //__aarch64__
-
- execute_window_loop(
- win_collapsed,
- [&](const Coordinates &) {
- const auto in_ptr = reinterpret_cast<const FromT *>(in.ptr());
-
- int x = window_start_x;
- for (; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- using from_vector = typename cast_vector<FromT>::type;
- const from_vector vin = load_input(in_ptr + x);
-
- switch (output->info()->data_type())
- {
- case DataType::U8:
- {
- using to_vector = typename cast_vector<uint8_t>::type;
- const to_vector vout = vcast<to_vector, from_vector>(vin);
- store_result<uint8_t>(reinterpret_cast<uint8_t *>(out.ptr()) + x, vout);
- break;
- }
- case DataType::QASYMM8:
- {
- using to_vector = typename cast_vector<float>::type;
- const UniformQuantizationInfo &qinfo_out =
- output->info()->quantization_info().uniform();
- const auto vf = vcast<to_vector, from_vector>(vin);
- const auto vout = vquantize(vf, qinfo_out);
- store_result<qasymm8_t>(reinterpret_cast<qasymm8_t *>(out.ptr()) + x, vout);
- break;
- }
- case DataType::U32:
- {
- using to_vector = typename cast_vector<uint32_t>::type;
- const to_vector vout = vcast<to_vector, from_vector>(vin);
- store_result<uint32_t>(reinterpret_cast<uint32_t *>(out.ptr()) + x, vout);
- break;
- }
- case DataType::S32:
- {
- using to_vector = typename cast_vector<int32_t>::type;
- const to_vector vout = vcast<to_vector, from_vector>(vin);
- store_result<int32_t>(reinterpret_cast<int32_t *>(out.ptr()) + x, vout);
- break;
- }
- case DataType::F32:
- {
- using to_vector = typename cast_vector<float>::type;
- const to_vector vout = vcast<to_vector, from_vector>(vin);
- store_result<float>(reinterpret_cast<float *>(out.ptr()) + x, vout);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Unsupported data type.");
- }
- }
-
- // Compute left-over elements
- for (; x < window_end_x; ++x)
- {
- FromT val = get_value(in_ptr + x);
- switch (output->info()->data_type())
- {
- case DataType::U8:
- {
- *(reinterpret_cast<uint8_t *>(out.ptr()) + x) = static_cast<uint8_t>(val);
- break;
- }
- case DataType::QASYMM8:
- {
- const QuantizationInfo &qinfo_out = output->info()->quantization_info();
- const auto qval =
- quantize_qasymm8(static_cast<float>(val), qinfo_out, rounding_policy);
- *(reinterpret_cast<qasymm8_t *>(out.ptr()) + x) = qval;
- break;
- }
- case DataType::U32:
- {
- *(reinterpret_cast<uint32_t *>(out.ptr()) + x) = static_cast<uint32_t>(val);
- break;
- }
- case DataType::S32:
- {
- *(reinterpret_cast<int32_t *>(out.ptr()) + x) = static_cast<int32_t>(val);
- break;
- }
- case DataType::F32:
- {
- *(reinterpret_cast<float *>(out.ptr()) + x) = static_cast<float>(val);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Unsupported data type.");
- }
- }
- },
- in, out);
-}
-
-void run_cast_qasymm8(const ITensor *input, ITensor *output, const Window &window)
-{
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- // Collapse window and reset first dimension to handle tail calculations manually
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- // Create iterators
- Iterator in(input, win_collapsed);
- Iterator out(output, win_collapsed);
-
-#ifdef __aarch64__
- constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
-#else //__aarch64__
- constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO;
-#endif //__aarch64__
- const auto &qinfo_in = input->info()->quantization_info().uniform();
- const auto &qinfo_out = output->info()->quantization_info().uniform();
-
- execute_window_loop(
- win_collapsed,
- [&](const Coordinates &) {
- const auto in_ptr = reinterpret_cast<const qasymm8_t *>(in.ptr());
-
- int x = window_start_x;
- for (; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- using from_vector = typename cast_vector<float>::type;
- const auto vf = wrapper::vloadq(in_ptr + x);
- const auto vin = vdequantize(vf, qinfo_in);
- switch (output->info()->data_type())
- {
- case DataType::U8:
- {
- using to_vector = typename cast_vector<uint8_t>::type;
- const to_vector vout = vcast<to_vector, from_vector>(vin);
- store_result<uint8_t>(reinterpret_cast<uint8_t *>(out.ptr()) + x, vout);
- break;
- }
- case DataType::QASYMM8:
- {
- using to_vector = typename cast_vector<float>::type;
- const auto vf = vcast<to_vector, from_vector>(vin);
- const auto vout = vquantize(vf, qinfo_out);
- store_result<qasymm8_t>(reinterpret_cast<qasymm8_t *>(out.ptr()) + x, vout);
- break;
- }
- case DataType::U32:
- {
- using to_vector = typename cast_vector<uint32_t>::type;
- const to_vector vout = vcast<to_vector, from_vector>(vin);
- store_result<uint32_t>(reinterpret_cast<uint32_t *>(out.ptr()) + x, vout);
- break;
- }
- case DataType::S32:
- {
- using to_vector = typename cast_vector<int32_t>::type;
- const to_vector vout = vcast<to_vector, from_vector>(vin);
- store_result<int32_t>(reinterpret_cast<int32_t *>(out.ptr()) + x, vout);
- break;
- }
- case DataType::F32:
- {
- using to_vector = typename cast_vector<float>::type;
- const to_vector vout = vcast<to_vector, from_vector>(vin);
- store_result<float>(reinterpret_cast<float *>(out.ptr()) + x, vout);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Unsupported data type.");
- }
- }
-
- // Compute left-over elements
- for (; x < window_end_x; ++x)
- {
- qasymm8_t qval_in = *(in_ptr + x);
- const auto val = dequantize_qasymm8(qval_in, qinfo_in);
-
- switch (output->info()->data_type())
- {
- case DataType::U8:
- {
- *(reinterpret_cast<uint8_t *>(out.ptr()) + x) = static_cast<uint8_t>(val);
- break;
- }
- case DataType::QASYMM8:
- {
- const auto qval_out = quantize_qasymm8(val, qinfo_out, rounding_policy);
- *(reinterpret_cast<qasymm8_t *>(out.ptr()) + x) = qval_out;
- break;
- }
- case DataType::U32:
- {
- *(reinterpret_cast<uint32_t *>(out.ptr()) + x) = static_cast<uint32_t>(val);
- break;
- }
- case DataType::S32:
- {
- *(reinterpret_cast<int32_t *>(out.ptr()) + x) = static_cast<int32_t>(val);
- break;
- }
- case DataType::F32:
- {
- *(reinterpret_cast<float *>(out.ptr()) + x) = static_cast<float>(val);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Unsupported data type.");
- }
- }
- },
- in, out);
-}
-} // namespace
-
-NECastKernel::NECastKernel() : _input(nullptr), _output(nullptr), _input_subtype(SubDataType::NONE)
-{
-}
-
-void NECastKernel::configure(const ITensor *input, ITensor *output, SubDataType input_subtype)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), input_subtype));
-
- _input = input;
- _output = output;
- _input_subtype = input_subtype;
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info());
-
- ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-
- INEKernel::configure(std::get<1>(win_config));
-}
-
-Status NECastKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
- SubDataType input_subtype)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, input_subtype));
- ARM_COMPUTE_RETURN_ON_ERROR(
- std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
- return Status{};
-}
-
-void NECastKernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
- switch (_input->info()->data_type())
- {
- case DataType::U8:
- if (_input_subtype == SubDataType::BOOL)
- {
- run_cast<bool>(_input, _output, window);
- }
- else
- {
- run_cast<uint8_t>(_input, _output, window);
- }
- break;
- case DataType::QASYMM8:
- run_cast_qasymm8(_input, _output, window);
- break;
- case DataType::U32:
- run_cast<uint32_t>(_input, _output, window);
- break;
- case DataType::S32:
- run_cast<int32_t>(_input, _output, window);
- break;
- case DataType::F32:
- run_cast<float>(_input, _output, window);
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported data type.");
- }
-}
-} // namespace arm_compute
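
The deleted run_cast() loops follow the usual NEON structure: a 16-element vectorized main
loop followed by a scalar "left-over" loop for the tail, with the vcast<> specializations
doing the widening, narrowing, or saturating conversion per lane. A scalar stand-in for
that shape (the NEON path replaces the inner loop with vmovl/vqmovn chains):

#include <cstdint>

// Vectorized-main-loop-plus-scalar-tail structure of run_cast(): 16-wide steps
// first, then element-by-element for the remainder.
void cast_u8_to_s32(const uint8_t *in, int32_t *out, int n)
{
  constexpr int step = 16; // matches window_step_x in the deleted kernel
  int x = 0;
  for (; x <= n - step; x += step)   // vectorized body in the NEON version
    for (int i = 0; i < step; ++i)   // (one widening conversion per 16 lanes)
      out[x + i] = static_cast<int32_t>(in[x + i]);
  for (; x < n; ++x)                 // left-over tail, always scalar
    out[x] = static_cast<int32_t>(in[x]);
}
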
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp
deleted file mode 100644
index 95e269dee..000000000
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
-#include <arm_neon.h>
-#include <cstdint>
-
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
- ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 2);
-
- const DataLayout data_layout = input->data_layout();
- const int idx_channel =
- get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] % (block_shape * block_shape) !=
- 0);
- // Validate output if initialized
- if (output->total_size() != 0)
- {
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height =
- get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] !=
- (block_shape * input->tensor_shape()[idx_width]));
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] !=
- (block_shape * input->tensor_shape()[idx_height]));
- ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- }
-
- return Status{};
-}
-} // namespace
-
-NEDepthToSpaceLayerKernelEx::NEDepthToSpaceLayerKernelEx()
- : _input(nullptr), _output(nullptr), _block_shape()
-{
-}
-
-void NEDepthToSpaceLayerKernelEx::configure(const ITensor *input, ITensor *output,
- int32_t block_shape)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- TensorShape output_shape = compute_depth_to_space_shape_ex(input->info(), block_shape);
-  // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape));
-
- _input = input;
- _output = output;
- _block_shape = block_shape;
-
- // Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps());
- ICPPKernel::configure(win);
-}
-
-Status NEDepthToSpaceLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output,
- int32_t block_shape)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, block_shape));
- return Status{};
-}
-
-void NEDepthToSpaceLayerKernelEx::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
-
- const int idx_channel =
- get_data_layout_dimension_index(_input->info()->data_layout(), DataLayoutDimension::CHANNEL);
- const int depth_size = _input->info()->dimension(idx_channel);
- const int r = (depth_size / (_block_shape * _block_shape));
- const int element_size = _input->info()->element_size();
-
- // Main loop for NCHW and NHWC
- if (_input->info()->data_layout() == DataLayout::NCHW)
- {
- Window slice_in = window.first_slice_window_2D();
- do
- {
- Iterator in(_input, slice_in);
- execute_window_loop(slice_in,
- [&](const Coordinates &id) {
- const int x = id.x();
- const int y = id.y();
-
- const int z = id.z() % r;
- const int out_x = x * _block_shape + (id.z() / r) % _block_shape;
- const int out_y = y * _block_shape + (id.z() / r) / _block_shape;
- Coordinates output_coords{out_x, out_y, z, id[3]};
- memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
- },
- in);
- } while (window.slide_window_slice_2D(slice_in));
- }
- else
- {
- Window slice_in = window.first_slice_window_3D();
- do
- {
- Iterator in(_input, slice_in);
- execute_window_loop(slice_in,
- [&](const Coordinates &id) {
- const int x = id.y();
- const int y = id.z();
-
- const int z = id.x() % r;
- const int out_x = x * _block_shape + (id.x() / r) % _block_shape;
- const int out_y = y * _block_shape + (id.x() / r) / _block_shape;
- Coordinates output_coords{z, out_x, out_y, id[3]};
- memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
- },
- in);
- } while (window.slide_window_slice_3D(slice_in));
- }
-}
-} // namespace arm_compute
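The index math of the deleted kernel is easiest to read as a standalone NCHW reference. The mapping below is lifted directly from the removed run(): with r = C / (block_shape * block_shape), input element (x, y, z) is copied to output (x * bs + (z / r) % bs, y * bs + (z / r) / bs, z % r). depth_to_space_nchw is a hypothetical single-batch helper, not the removed code:

#include <cstddef>
#include <vector>

// Reference NCHW depth-to-space for one batch. Output shape is
// (C / (bs * bs), H * bs, W * bs); total element count is unchanged.
std::vector<float> depth_to_space_nchw(const std::vector<float> &in, int C, int H, int W, int bs)
{
  const int r = C / (bs * bs); // output channel count
  std::vector<float> out(static_cast<std::size_t>(C) * H * W);
  for (int z = 0; z < C; ++z)
    for (int y = 0; y < H; ++y)
      for (int x = 0; x < W; ++x)
      {
        const int out_x = x * bs + (z / r) % bs;
        const int out_y = y * bs + (z / r) / bs;
        const int out_z = z % r;
        out[(static_cast<std::size_t>(out_z) * H * bs + out_y) * W * bs + out_x] =
            in[(static_cast<std::size_t>(z) * H + y) * W + x];
      }
  return out;
}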
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp
deleted file mode 100644
index 200fc4f87..000000000
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h"
-
-#include "arm_compute/core/CPP/Validate.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-
-#include <algorithm>
-#include <arm_neon.h>
-#include <cstdint>
-#include <map>
-#include <string>
-
-namespace arm_compute
-{
-class Coordinates;
-
-namespace
-{
-template <ElementWiseUnaryEx op, typename ScalarType>
-inline ScalarType elementwise_op_scalar(const ScalarType &a)
-{
- switch (op)
- {
- case ElementWiseUnaryEx::NEG:
- return -a;
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
-}
-
-template <ElementWiseUnaryEx op, typename VectorType>
-inline VectorType elementwise_op(const VectorType &a)
-{
- switch (op)
- {
- case ElementWiseUnaryEx::NEG:
- return wrapper::vneg(a);
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
-}
-
-template <ElementWiseUnaryEx op, typename ScalarType>
-void elementwise_op(const ITensor *in, ITensor *out, const Window &window)
-{
- const int window_step_x = 16 / sizeof(ScalarType);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(in, win);
- Iterator output(out, win);
-
- execute_window_loop(win,
- [&](const Coordinates &) {
- auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
- const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
-
- int x = window_start_x;
- for (; x <= window_end_x - window_step_x; x += window_step_x)
- {
- wrapper::vstore(output_ptr + x,
- elementwise_op<op>(wrapper::vloadq(input_ptr + x)));
- }
- for (; x < window_end_x; ++x)
- {
- *(output_ptr + x) = elementwise_op_scalar<op>(*(input_ptr + x));
- }
- },
- input, output);
-}
-
-template <ElementWiseUnaryEx op>
-std::function<void(const ITensor *input, ITensor *output, const Window &window)>
-configure_func(const ITensor *input, ITensor *output)
-{
- std::string function_to_call("op_");
- function_to_call += string_from_data_type(input->info()->data_type()) + "_";
- function_to_call += string_from_data_type(output->info()->data_type());
-
- static std::map<std::string, NEElementwiseUnaryKernelEx::ElementwiseUnaryFunction *>
- map_function = {
- {"op_F32_F32", &elementwise_op<op, float>}, {"op_S32_S32", &elementwise_op<op, int32_t>},
- };
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- map_function["op_F16_F16"] = &elementwise_op<op, float16_t>;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
- auto it = map_function.find(function_to_call);
-
- if (it != map_function.end())
- {
- auto func = it->second;
- return [func](const ITensor *input, ITensor *output, const Window &window) {
- func(input, output, window);
- };
- }
- return nullptr;
-}
-} // namespace
-
-NEElementwiseUnaryKernelEx::NEElementwiseUnaryKernelEx()
- : _function(nullptr), _input(nullptr), _output(nullptr)
-{
-}
-
-void NEElementwiseUnaryKernelEx::configure(ElementWiseUnaryEx op, const ITensor *input,
- ITensor *output)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input->info(), *output->info()));
-
- // Configure kernel window
- const std::pair<TensorShape, ValidRegion> broadcast_pair =
- ITensorInfo::broadcast_shape_and_valid_region(*input->info());
- const TensorShape &out_shape = broadcast_pair.first;
- const ValidRegion &valid_region = broadcast_pair.second;
-
- // Auto initialize output if not initialized
- auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type());
-
- Window win = calculate_max_window(valid_region);
-
- _input = input;
- _output = output;
-
- INEKernel::configure(win);
-
- switch (op)
- {
- case ElementWiseUnaryEx::NEG:
- _function = configure_func<ElementWiseUnaryEx::NEG>(input, output);
- break;
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
-}
-
-Status NEElementwiseUnaryKernelEx::validate_arguments(const ITensorInfo &input,
- const ITensorInfo &output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::F16, DataType::F32,
- DataType::S32);
-
- // Validate in case of configured output
- if (output.total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output);
- }
-
- return Status{};
-}
-
-Status NEElementwiseUnaryKernelEx::validate(ElementWiseUnaryEx op, const ITensorInfo *input,
- const ITensorInfo *output)
-{
- ARM_COMPUTE_UNUSED(op);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *output));
- return Status{};
-}
-
-void NEElementwiseUnaryKernelEx::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_function == nullptr);
- _function(_input, _output, window);
-}
-} // namespace arm_compute
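The removed kernel paired a 16-byte NEON main loop with a scalar tail, selecting the typed instantiation through a string-keyed function map in configure_func. A self-contained sketch of the NEG path for F32, matching that main-loop/leftover split; neg_f32 is illustrative and uses raw intrinsics, whereas the deleted code went through the arm_compute wrapper:: layer:

#include <arm_neon.h>
#include <cstddef>

// Negate four floats per iteration with vnegq_f32 (window_step_x was
// 16 / sizeof(float) = 4), then handle the remaining tail scalars.
void neg_f32(const float *in, float *out, std::size_t count)
{
  std::size_t i = 0;
  for (; i + 4 <= count; i += 4)
  {
    vst1q_f32(out + i, vnegq_f32(vld1q_f32(in + i)));
  }
  for (; i < count; ++i)
  {
    out[i] = -in[i];
  }
}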
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp
deleted file mode 100644
index 641641b5a..000000000
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp
+++ /dev/null
@@ -1,291 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/NEPReLUKernel.h"
-
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Window.h"
-
-#include <arm_neon.h>
-
-using namespace arm_compute;
-namespace
-{
-
-/** Conditional element-wise operations */
-enum class ConditionalOperation
-{
- PRELU, /**< (x * y) for x < 0, x for x >= 0 */
-};
-
-template <ConditionalOperation op, typename ScalarType>
-inline ScalarType elementwise_conditional_op_scalar(const ScalarType &a, const ScalarType &b)
-{
- auto res = ScalarType(0);
-
- switch (op)
- {
- case ConditionalOperation::PRELU:
- res = a < 0 ? a * b : a;
- break;
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
- return res;
-}
-
-template <ConditionalOperation op>
-inline uint8_t elementwise_conditional_op_quantized_scalar(const float &a, const float &b,
- QuantizationInfo qinfo)
-{
- return quantize_qasymm8(elementwise_conditional_op_scalar<op>(a, b), qinfo,
- RoundingPolicy::TO_NEAREST_UP);
-}
-
-template <ConditionalOperation op, typename VectorType>
-inline VectorType elementwise_conditional_op(const VectorType &a, const VectorType &b)
-{
- VectorType res = {0, 0, 0, 0};
- VectorType const_0 = {0, 0, 0, 0};
-
- switch (op)
- {
- case ConditionalOperation::PRELU:
-      res = wrapper::vbsl(wrapper::vcgt(a, const_0), a, wrapper::vmul(a, b));
- break;
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
- return res;
-}
-
-template <ConditionalOperation op>
-inline float32x4x4_t elementwise_conditional_op(const float32x4x4_t &a, const float32x4x4_t &b)
-{
- float32x4x4_t out = {{
- elementwise_conditional_op<op>(a.val[0], b.val[0]),
- elementwise_conditional_op<op>(a.val[1], b.val[1]),
- elementwise_conditional_op<op>(a.val[2], b.val[2]),
- elementwise_conditional_op<op>(a.val[3], b.val[3]),
- }};
- return out;
-}
-
-template <ConditionalOperation op, typename ScalarType, typename VectorType>
-inline VectorType elementwise_conditional_op_broadcast(const VectorType &a,
- const ScalarType &broadcast_value,
- const bool reorder)
-{
- VectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag());
- return elementwise_conditional_op<op>(reorder ? broadcast_vector : a,
- reorder ? a : broadcast_vector);
-}
-
-template <ConditionalOperation op, typename ScalarType, typename VectorType>
-inline int elementwise_conditional_op_loop(int window_start_x, int window_end_x, int window_step_x,
- const ScalarType *input1_ptr,
- const ScalarType *input2_ptr, ScalarType *output_ptr)
-{
- int x = window_start_x;
- for (; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq(input1_ptr + x);
- const auto b = wrapper::vloadq(input2_ptr + x);
- wrapper::vstore(output_ptr + x, elementwise_conditional_op<op>(a, b));
- }
- return x;
-}
-
-template <ConditionalOperation op>
-inline int elementwise_conditional_op_quantized_loop(int window_start_x, int window_end_x,
- int window_step_x, const uint8_t *input1_ptr,
- const uint8_t *input2_ptr, uint8_t *output_ptr,
- int32x4_t voffset1, int32x4_t voffset2,
- float32x4_t vscale1, float32x4_t vscale2,
- float32x4_t voffseto, float32x4_t invvscaleo)
-{
- int x = window_start_x;
- for (; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- // Get inputs and compute output
- const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1);
- const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2);
- const float32x4x4_t rf = elementwise_conditional_op<op>(af, bf);
- store_quantized(output_ptr + x, rf, voffseto, invvscaleo);
- }
- return x;
-}
-
-template <ConditionalOperation op, typename ScalarType, typename VectorType>
-inline int elementwise_conditional_op_broadcast_loop(int window_start_x, int window_end_x,
- int window_step_x,
- const ScalarType *non_broadcast_input_ptr,
- const ScalarType &broadcast_value,
- ScalarType *output_ptr, const bool reorder)
-{
- int x = window_start_x;
- for (; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq((non_broadcast_input_ptr + x));
- wrapper::vstore(output_ptr + x,
- elementwise_conditional_op_broadcast<op>(a, broadcast_value, reorder));
- }
- return x;
-}
-
-template <ConditionalOperation op>
-inline int elementwise_conditional_op_quantized_broadcast_loop(
- int window_start_x, int window_end_x, int window_step_x, const uint8_t *non_broadcast_input_ptr,
- float32x4x4_t broadcast_vector, uint8_t *output_ptr, int32x4_t voffset_non_broadcast,
- float32x4_t vscale_non_broadcast, float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
-{
- int x = window_start_x;
- for (; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float32x4x4_t af =
- load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
- const float32x4x4_t rf = elementwise_conditional_op<op>(reorder ? broadcast_vector : af,
- reorder ? af : broadcast_vector);
- store_quantized(output_ptr + x, rf, voffseto, invvscaleo);
- }
- return x;
-}
-
-template <ConditionalOperation op, typename ScalarType, typename VectorType>
-void elementwise_conditional_op(const ITensor *in1, const ITensor *in2, ITensor *out,
- const Window &window)
-{
- elementwise_op(in1, in2, out, window, &elementwise_conditional_op_scalar<op, ScalarType>,
- &elementwise_conditional_op_broadcast_loop<op, ScalarType, VectorType>,
- &elementwise_conditional_op_loop<op, ScalarType, VectorType>);
-}
-
-template <ConditionalOperation op>
-void elementwise_conditional_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out,
- const Window &window)
-{
- elementwise_op_quantized(in1, in2, out, window, &elementwise_conditional_op_quantized_scalar<op>,
- &elementwise_conditional_op_quantized_broadcast_loop<op>,
- &elementwise_conditional_op_quantized_loop<op>);
-}
-} // namespace
-
-NEPReLUKernel::NEPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {}
-
-void NEPReLUKernel::configure(const ITensor *input, const ITensor *alpha, ITensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, alpha, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input->info(), *alpha->info(), *output->info()));
-
- // Configure kernel window
- const std::pair<TensorShape, ValidRegion> broadcast_pair =
- ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info());
- const TensorShape &out_shape = broadcast_pair.first;
- const ValidRegion &valid_region = broadcast_pair.second;
-
- // Auto initialize output if not initialized
- auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type());
-
- Window win = calculate_max_window(valid_region);
-
- _input = input;
- _alpha = alpha;
- _output = output;
- INEKernel::configure(win);
-}
-
-void NEPReLUKernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- if (_input->info()->data_type() == DataType::F32)
- {
- elementwise_conditional_op<ConditionalOperation::PRELU, float, float32x4_t>(_input, _alpha,
- _output, window);
- }
- else if (_input->info()->data_type() == DataType::QASYMM8)
- {
- elementwise_conditional_op_quantized<ConditionalOperation::PRELU>(_input, _alpha, _output,
- window);
- }
- else
- {
- ARM_COMPUTE_ERROR("Wrong Type");
- }
-}
-
-Status NEPReLUKernel::validate_arguments(const ITensorInfo &input, const ITensorInfo &alpha,
- const ITensorInfo &output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &alpha, &output);
-
- const TensorShape out_shape =
- TensorShape::broadcast_shape(input.tensor_shape(), alpha.tensor_shape());
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
- "Inputs are not broadcast compatible");
-
- // Checks performed when output is configured
- if (output.total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
- "Wrong shape for output");
- }
-
- return Status{};
-}
-
-Status NEPReLUKernel::validate(const ITensorInfo *input, const ITensorInfo *alpha,
- const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, alpha, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *alpha, *output));
-
- return Status{};
-}
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp
index 6ba0f1fd4..5841f1d69 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp
@@ -64,7 +64,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scale_factor, 1, DataType::F16,
DataType::F32);
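The hunk above only retargets the output type check from S8 to the newer QASYMM8_SIGNED name; the kernel logic is unchanged. For orientation, a sketch of symmetric signed 8-bit quantization as conventionally defined, with values clamped to [-127, 127] to keep the range symmetric around zero; this is an assumption about the scheme, not code taken from the kernel:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>

// Assumed symmetric scheme: q = round(x / scale), clamped to [-127, 127]
// so the representable range stays symmetric (no -128).
void quantize_symm8(const float *in, int8_t *out, float scale, std::size_t count)
{
  for (std::size_t i = 0; i < count; ++i)
  {
    const int q = static_cast<int>(std::lround(in[i] / scale));
    out[i] = static_cast<int8_t>(std::min(127, std::max(-127, q)));
  }
}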
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp
deleted file mode 100644
index 44feb200f..000000000
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
-#include <arm_neon.h>
-#include <cstdint>
-
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
-
- ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 1);
-
- // Validate output if initialized
- if (output->total_size() != 0)
- {
- const DataLayout data_layout = input->data_layout();
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height =
- get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int idx_channel =
- get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- const int idx_batch =
- get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
- ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_width] % block_shape != 0);
- ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_height] % block_shape != 0);
- ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] !=
- output->tensor_shape()[idx_batch]);
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_channel] % (block_shape * block_shape) !=
- 0);
- ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().total_size() !=
- output->tensor_shape().total_size());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- }
-
- return Status{};
-}
-} // namespace
-
-NESpaceToDepthLayerKernelEx::NESpaceToDepthLayerKernelEx()
- : _input(nullptr), _output(nullptr), _block_shape()
-{
-}
-
-void NESpaceToDepthLayerKernelEx::configure(const ITensor *input, ITensor *output,
- int32_t block_shape)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- TensorShape output_shape = compute_space_to_depth_shape_ex(input->info(), block_shape);
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape));
-
- _input = input;
- _block_shape = block_shape;
- _output = output;
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info(), Steps());
- INEKernel::configure(win);
-}
-
-Status NESpaceToDepthLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output,
- int32_t block_shape)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, block_shape));
- return Status{};
-}
-
-void NESpaceToDepthLayerKernelEx::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
-
- const DataLayout data_layout = _input->info()->data_layout();
- const int channel_idx =
- get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- const int element_size = _input->info()->element_size();
-
- const size_t channel_size = _input->info()->dimension(channel_idx);
-
- Window slice_out = window.first_slice_window_3D();
-
- int batch_id = 0;
-
- // Main loop for NCHW and NHWC
- if (_output->info()->data_layout() == DataLayout::NCHW)
- {
- do
- {
- Iterator out(_output, slice_out);
- execute_window_loop(slice_out,
- [&](const Coordinates &id) {
- const size_t channel_id = id.z();
- const size_t in_x =
- id.x() * _block_shape + (channel_id / channel_size) % _block_shape;
- const size_t in_y =
- id.y() * _block_shape + (channel_id / channel_size) / _block_shape;
- const int z = channel_id % channel_size;
- Coordinates input_coords{in_x, in_y, z, batch_id};
- memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
- },
- out);
- ++batch_id;
- } while (window.slide_window_slice_3D(slice_out));
- }
- else
- {
- do
- {
- Iterator out(_output, slice_out);
- execute_window_loop(slice_out,
- [&](const Coordinates &id) {
- const size_t channel_id = id.x();
- const size_t in_x =
- id.y() * _block_shape + (channel_id / channel_size) % _block_shape;
- const size_t in_y =
- id.z() * _block_shape + (channel_id / channel_size) / _block_shape;
- const int z = channel_id % channel_size;
- Coordinates input_coords{z, in_x, in_y, batch_id};
- memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
- },
- out);
- ++batch_id;
- } while (window.slide_window_slice_3D(slice_out));
- }
-}
-} // namespace arm_compute
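As with the depth-to-space kernel, the deleted space-to-depth gather reads more clearly as a standalone NCHW reference. The coordinates below mirror the removed run(): output element (x, y, c) reads input (x * bs + (c / Cin) % bs, y * bs + (c / Cin) / bs, c % Cin), where Cin is the input channel count. space_to_depth_nchw is a hypothetical single-batch helper, not the removed code:

#include <cstddef>
#include <vector>

// Reference NCHW space-to-depth for one batch. Output shape is
// (Cin * bs * bs, Hin / bs, Win / bs); total element count is unchanged.
std::vector<float> space_to_depth_nchw(const std::vector<float> &in, int Cin, int Hin, int Win, int bs)
{
  const int Cout = Cin * bs * bs;
  const int Hout = Hin / bs;
  const int Wout = Win / bs;
  std::vector<float> out(static_cast<std::size_t>(Cout) * Hout * Wout);
  for (int c = 0; c < Cout; ++c)
    for (int y = 0; y < Hout; ++y)
      for (int x = 0; x < Wout; ++x)
      {
        const int in_x = x * bs + (c / Cin) % bs;
        const int in_y = y * bs + (c / Cin) / bs;
        const int in_c = c % Cin;
        out[(static_cast<std::size_t>(c) * Hout + y) * Wout + x] =
            in[(static_cast<std::size_t>(in_c) * Hin + in_y) * Win + in_x];
      }
  return out;
}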