summaryrefslogtreecommitdiff
path: root/libs/ARMComputeEx/src/core/CL/kernels
diff options
context:
space:
mode:
Diffstat (limited to 'libs/ARMComputeEx/src/core/CL/kernels')
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp211
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp159
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp216
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp117
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp173
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp102
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp212
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp109
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp114
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp77
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp129
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp177
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp89
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp166
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp185
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp149
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp126
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp280
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp181
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp238
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp113
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp170
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp253
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp468
24 files changed, 0 insertions, 4214 deletions
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp
deleted file mode 100644
index 1fdd2f98f..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp
+++ /dev/null
@@ -1,211 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLActivationLayerExKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/UtilsEx.h"
-
-using namespace arm_compute;
-
-namespace
-{
-// Validates the tensor metadata for the extended activation kernel.
-//
-// input    : checked to be single-channel U8, QASYMM8, F16 or F32.
-// output   : optional; only inspected when it is non-null and already has a non-zero total size,
-//            in which case it must match the input's shape and data type.
-// act_info : not inspected by this function.
-//
-// Returns an empty Status on success, or the error produced by the first failing check.
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
-                          const ActivationLayerInfoEx &act_info)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
-                                                       DataType::F16, DataType::F32);
-
-  // Nothing more to verify while the output is absent or still unconfigured
-  const bool output_is_configured = (output != nullptr) && (output->total_size() != 0);
-  if (!output_is_configured)
-  {
-    return Status{};
-  }
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-  return Status{};
-}
-
-// Computes the execution window of the activation kernel and updates the tensors' padding.
-//
-// input  : tensor info of the source; dereferenced unconditionally.
-// output : tensor info of the destination, or nullptr when the caller runs in place.
-//
-// Returns a (Status, Window) pair. The status is a RUNTIME_ERROR carrying
-// "Insufficient Padding!" when update_window_and_padding() reports a change, otherwise an
-// empty Status.
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
- if (output != nullptr)
- {
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output, *input);
- }
-
- // Elements handled per iteration: 16 divided by the input element size
- const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
-
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- bool window_changed = false;
-
- if (output != nullptr)
- {
- // Out-of-place: both tensors are accessed horizontally with the same step
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
- window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, input->valid_region());
- }
- else
- {
- // In-place: only the input access pattern constrains the window
- window_changed = update_window_and_padding(
- win, AccessWindowHorizontal(input, 0, num_elems_processed_per_iteration));
- }
-
- // A changed window is reported to the caller as an error
- Status err = (window_changed)
- ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
- : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-CLActivationLayerExKernel::CLActivationLayerExKernel()
- : _input(nullptr), _output(nullptr), _run_in_place(false)
-{
-}
-
-// Configures the kernel: validates the arguments, assembles the OpenCL build options, creates
-// the "activation_layer_ex" kernel and sets up the execution window.
-//
-// input    : source tensor; must not be nullptr.
-// output   : destination tensor. nullptr, or the same pointer as input, selects in-place mode.
-// act_info : activation description; activation(), a() and b() are read here.
-void CLActivationLayerExKernel::configure(ICLTensor *input, ICLTensor *output,
- ActivationLayerInfoEx act_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input);
-
- _run_in_place = (output == nullptr) || (output == input);
-
- if (output != nullptr)
- {
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), *input->info()->clone());
- }
-
- ARM_COMPUTE_ERROR_THROW_ON(
- validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, act_info));
-
- // Same per-iteration element count as used when computing the window: 16 / element size
- const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
- const DataType dt = input->info()->data_type();
- float a_const = act_info.a();
- float b_const = act_info.b();
- int a_const_int = 0;
- int b_const_int = 0;
-
- // Create quantized version of constants a, b if needed
- if (is_data_type_quantized(dt))
- {
- a_const_int =
- input->info()->quantization_info().quantize(a_const, RoundingPolicy::TO_NEAREST_UP);
- b_const_int =
- input->info()->quantization_info().quantize(b_const, RoundingPolicy::TO_NEAREST_UP);
- }
-
- // Set build options
- std::set<std::string> build_opts;
- build_opts.emplace(
- ("-DACT=" + lower_string(string_from_activation_func_ex(act_info.activation()))));
- build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
- build_opts.emplace(
- ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-
- if (is_data_type_quantized(dt))
- {
- // Quantized path: A_VAL / B_VAL are passed as the integer (quantized) constants
- build_opts.emplace(("-DA_VAL=" + support::cpp11::to_string(a_const_int)));
- build_opts.emplace(("-DB_VAL=" + support::cpp11::to_string(b_const_int)));
-
- const int o1 = input->info()->quantization_info().offset;
- // Quantized value of 0 corresponds to the offset o1
- build_opts.emplace(("-DCONST_0=" + support::cpp11::to_string(o1)));
-
- // Set scale and offset of the input and output if they have different quantization info
- if (is_data_type_quantized_asymmetric(dt) && output != nullptr)
- {
- const float s1 = input->info()->quantization_info().scale;
- const float s2 = output->info()->quantization_info().scale;
- const int o2 = output->info()->quantization_info().offset;
-
- if (o1 != o2 || s1 != s2)
- {
- build_opts.emplace(("-DS1_VAL=" + float_to_string_with_full_precision(s1)));
- build_opts.emplace(("-DS2_VAL=" + float_to_string_with_full_precision(s2)));
- build_opts.emplace(("-DO1_VAL=" + support::cpp11::to_string(o1)));
- build_opts.emplace(("-DO2_VAL=" + support::cpp11::to_string(o2)));
- }
- }
- }
- else
- {
- // Non-quantized path: A_VAL / B_VAL are passed as full-precision float literals
- build_opts.emplace(("-DA_VAL=" + float_to_string_with_full_precision(a_const)));
- build_opts.emplace(("-DB_VAL=" + float_to_string_with_full_precision(b_const)));
- }
-
- // NOTE(review): when not running in place this inserts an empty string into the option set
- build_opts.emplace((_run_in_place) ? "-DIN_PLACE" : "");
-
- // Create kernel
- std::string kernel_name = std::string("activation_layer_ex");
- _kernel =
- static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
- // Make sure _kernel is initialized before calling the parent's configure
- _input = input;
- _output = output;
-
- // Configure kernel window
- // In-place mode passes nullptr as output info, even when output == input
- auto win_config =
- validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // Set config_id for enabling LWS tuning
- // Resulting id has the form "activation_layer_ex_<data type>_<dim0>_<dim1>"
- _config_id = "activation_layer_ex_";
- _config_id += lower_string(string_from_data_type(dt));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
-}
-
-// Static validation entry point: runs the argument checks and then a dry-run of the window
-// configuration on cloned tensor infos, so the caller's infos are not modified.
-//
-// output may be nullptr or equal to input, both of which mean in-place execution; in that case
-// no output info is handed to the window configuration.
-//
-// Returns the first error encountered, or an empty Status.
-Status CLActivationLayerExKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                           const ActivationLayerInfoEx &act_info)
-{
-  const bool in_place = (output == nullptr) || (output == input);
-
-  const Status args_status = validate_arguments(input, output, act_info);
-  ARM_COMPUTE_RETURN_ON_ERROR(args_status);
-
-  // Work on clones so that auto-initialisation and padding updates stay local to this call
-  const auto input_clone = input->clone();
-  const auto output_clone = in_place ? nullptr : output->clone();
-
-  const Status window_status =
-    validate_and_configure_window(input_clone.get(), output_clone.get()).first;
-  ARM_COMPUTE_RETURN_ON_ERROR(window_status);
-
-  return Status{};
-}
-
-// Enqueues the configured kernel on `queue` for every 3D slice of `window`.
-//
-// window : execution window; checked against the kernel's configured window.
-// queue  : OpenCL command queue the kernel is enqueued on.
-void CLActivationLayerExKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- // Collapse dimensions from Z upwards when possible, then iterate over 3D slices
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_3D();
-
- do
- {
- // Argument index restarts at 0 for each slice
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- // The output tensor is only bound as a kernel argument in out-of-place mode
- if (!_run_in_place)
- {
- add_3D_tensor_argument(idx, _output, slice);
- }
- enqueue(queue, *this, slice, lws_hint());
- } while (collapsed.slide_window_slice_3D(slice));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp
deleted file mode 100644
index c1a2ad0be..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLArgMinMaxKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-// Derives the shape of an arg-min/max result: a copy of `input_shape` in which the dimension
-// at index `argminmax_axis` is set to 1.
-const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t argminmax_axis)
-{
-  // Start from the input shape and collapse the selected axis to a single element
-  TensorShape reduced_shape{input_shape};
-  reduced_shape.set(argminmax_axis, 1);
-  return reduced_shape;
-}
-} // namespace
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-// Validates the tensor metadata and the axis for the arg-min/max kernel.
-//
-// input          : checked to be single-channel S32, F32 or U8.
-// output         : must be non-empty and hold as many elements as the input shape with the
-//                  selected axis set to 1.
-// argminmax_axis : axis to reduce over; must be smaller than the input's number of dimensions.
-// op             : not inspected here (the operation is checked when the kernel is configured).
-//
-// Returns an empty Status on success, or an error describing the first failing check.
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
-                          const uint32_t argminmax_axis, ArgOperation op)
-{
-  ARM_COMPUTE_UNUSED(op);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32, DataType::F32,
-                                                       DataType::U8);
-  // NOTE(review): these two are debug-only assertions and are kept exactly as they were.
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(input, output);
-
-  // Reject an out-of-range axis *before* it is used to derive the expected output shape.
-  // The macro reports an error when its condition is TRUE, so the condition must describe the
-  // invalid case; the axis is unsigned, so only the upper bound needs checking.
-  const auto num_dimensions = input->tensor_shape().num_dimensions();
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-    argminmax_axis >= num_dimensions,
-    "argminmax_axis must be greater than or equal to 0 and less than (input's rank).");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
-                                  "Inputs are not broadcast compatible");
-
-  const TensorShape output_shape = inferOutputShape(input->tensor_shape(), argminmax_axis);
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(),
-                                  "output shape's size does not match argminmax_axis");
-
-  return Status{};
-}
-
-} // namespace
-
-CLArgMinMaxKernel::CLArgMinMaxKernel() : _input(nullptr), _output(nullptr), _argminmax_axis() {}
-
-// Configures the kernel: validates the arguments, builds the "arg_op" OpenCL kernel for the
-// requested operation and sets the execution window over the reduced output shape.
-//
-// input          : source tensor; must not be nullptr.
-// output         : destination tensor; must not be nullptr. Its valid region is updated.
-// argminmax_axis : axis to reduce over.
-// op             : ArgOperation::MAX or ArgOperation::MIN; anything else throws
-//                  std::runtime_error.
-void CLArgMinMaxKernel::configure(const ICLTensor *input, ICLTensor *output,
-                                  const uint32_t argminmax_axis, ArgOperation op)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  // validate_arguments() takes four parameters and has no default for `op`, so it must be
-  // forwarded here for the call to be well-formed.
-  ARM_COMPUTE_ERROR_THROW_ON(
-    validate_arguments(input->info(), output->info(), argminmax_axis, op));
-
-  _input = input;
-  _output = output;
-  _argminmax_axis = argminmax_axis;
-
-  // Work on a clone of the output info that carries the reduced shape
-  std::unique_ptr<ITensorInfo> output_info = output->info()->clone();
-  output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), argminmax_axis));
-
-  // Construct kernel name for argmax and argmin based on axis
-  std::string kernel_name = "arg_op";
-  int op_code = 0;
-  if (op == ArgOperation::MAX)
-  {
-    op_code = 1;
-  }
-  else if (op == ArgOperation::MIN)
-  {
-    op_code = 2;
-  }
-  else
-  {
-    throw std::runtime_error("Operation not supported, yet");
-  }
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(output_info->data_type()));
-  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2)));
-  build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code));
-
-  // Create kernel
-  _kernel =
-    static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
-  // Configure kernel window
-  Window win = calculate_max_window(*output_info, Steps());
-
-  // The whole reduced shape, starting at the origin, is marked valid on the real output
-  Coordinates coord;
-  coord.set_num_dimensions(output_info->num_dimensions());
-  output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape()));
-
-  ICLKernel::configure_internal(win);
-}
-
-Status CLArgMinMaxKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
- const uint32_t argminmax_axis, ArgOperation op)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, argminmax_axis, op));
-
- return Status{};
-}
-
-// Enqueues the arg-min/max kernel on `queue` for the slices of `window`.
-//
-// NOTE(review): the output tensor's shape is temporarily replaced while slices are enqueued
-// and restored afterwards, so the shared tensor info is mutated during this call.
-void CLArgMinMaxKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const TensorShape &shape_in = _input->info()->tensor_shape();
-
- unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
-
- // Trailing scalar arguments: the axis and the input extent along that axis
- _kernel.setArg<cl_int>(idx++, _argminmax_axis);
- _kernel.setArg<cl_int>(idx++, shape_in[_argminmax_axis]);
-
- Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
- // Setup input slice
- // All four leading dimensions of the input window are set to (0, 0, 0)
- Window slice_in(slice_out);
- slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
- slice_in.set(3, Window::Dimension(0, 0, 0));
-
- // Copy output's shape in order to use for recovering at end of this method
- const TensorShape shape_out = _output->info()->tensor_shape();
- _output->info()->set_tensor_shape(inferOutputShape(shape_in, _argminmax_axis));
-
- do
- {
- // This inner idx shadows the outer one and restarts the tensor arguments at index 0
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, slice_in);
- add_4D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice_out);
- } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
-
- // Recover output's shape of output tensor
- _output->info()->set_tensor_shape(shape_out);
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp
deleted file mode 100644
index 1c505b4d5..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp
+++ /dev/null
@@ -1,216 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionExKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-// Validates the tensor metadata for the extended arithmetic subtraction kernel.
-//
-// input1, input2 : checked to be single-channel U8, S16, F16 or F32 and broadcast compatible.
-// output         : only inspected when its total size is non-zero; then it must use one of the
-//                  same data types, may be U8 only if both inputs are U8, and must have the
-//                  broadcast shape of the inputs.
-// policy         : unused here.
-//
-// Returns an empty Status on success, or the error of the first failing check.
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
-                          const ITensorInfo *output, ConvertPolicy policy)
-{
-  ARM_COMPUTE_UNUSED(policy);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16,
-                                                       DataType::F16, DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16,
-                                                       DataType::F16, DataType::F32);
-
-  const TensorShape broadcast_result =
-    TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(broadcast_result.total_size() == 0,
-                                  "Inputs are not broadcast compatible");
-
-  // An unconfigured output has nothing further to be checked against
-  if (output->total_size() == 0)
-  {
-    return Status{};
-  }
-
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16,
-                                                       DataType::F16, DataType::F32);
-
-  const bool both_inputs_u8 =
-    (input1->data_type() == DataType::U8) && (input2->data_type() == DataType::U8);
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && !both_inputs_u8,
-                                  "Output can only be U8 if both inputs are U8");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-    detail::have_different_dimensions(broadcast_result, output->tensor_shape(), 0),
-    "Wrong shape for output");
-
-  return Status{};
-}
-
-// Auto-initialises the output info where needed, computes the execution window over the
-// broadcast valid region of both inputs and updates the tensors' padding.
-//
-// Returns a (Status, Window) pair. The status is a RUNTIME_ERROR carrying
-// "Insufficient Padding!" when any window/padding update reports a change, otherwise an empty
-// Status.
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2,
- ITensorInfo *output)
-{
- const std::pair<TensorShape, ValidRegion> broadcast_pair =
- ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
- const TensorShape &out_shape = broadcast_pair.first;
- const ValidRegion &valid_region = broadcast_pair.second;
-
- // Auto initialize output if not initialized
- {
- set_shape_if_empty(*output, out_shape);
-
- // Output format selection: S16 if either input is S16, F16 only if both inputs are F16,
- // F32 if either input is F32; otherwise the format is left untouched.
- if (input1->data_type() == DataType::S16 || input2->data_type() == DataType::S16)
- {
- set_format_if_unknown(*output, Format::S16);
- }
- else if (input1->data_type() == DataType::F16 && input2->data_type() == DataType::F16)
- {
- set_format_if_unknown(*output, Format::F16);
- }
- else if (input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32)
- {
- set_format_if_unknown(*output, Format::F32);
- }
- }
-
- Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
- Window win_input1 = win.broadcast_if_dimension_le_one(*input1);
- Window win_input2 = win.broadcast_if_dimension_le_one(*input2);
-
- AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
- // NOTE(review): '||' short-circuits, so once one update reports a change the remaining
- // update_window_and_padding() calls are not executed.
- bool window_changed = update_window_and_padding(win_input1, input1_access) ||
- update_window_and_padding(win_input2, input2_access) ||
- update_window_and_padding(win, output_access);
-
- output_access.set_valid_region(win, valid_region);
-
- Status err = (window_changed)
- ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
- : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-CLArithmeticSubtractionExKernel::CLArithmeticSubtractionExKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-// Configures the kernel: validates the arguments, computes the execution window (which may
-// auto-initialise the output info), builds the "arithmetic_sub_ex" OpenCL kernel and registers
-// the window.
-//
-// input1, input2 : source tensors; must not be nullptr.
-// output         : destination tensor; must not be nullptr.
-// policy         : overflow policy; WRAP is also selected whenever the output type is float.
-void CLArithmeticSubtractionExKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
-                                                ICLTensor *output, ConvertPolicy policy)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-  ARM_COMPUTE_ERROR_THROW_ON(
-    validate_arguments(input1->info(), input2->info(), output->info(), policy));
-
-  // The window must be configured first: it may initialise the output's shape and format
-  auto window_result =
-    validate_and_configure_window(input1->info(), input2->info(), output->info());
-  ARM_COMPUTE_ERROR_THROW_ON(window_result.first);
-
-  _input1 = input1;
-  _input2 = input2;
-  _output = output;
-
-  // Floating point outputs always use the wrapping variant of the kernel
-  const bool float_output = is_data_type_float(output->info()->data_type());
-  const bool use_wrap = (policy == ConvertPolicy::WRAP) || float_output;
-
-  // Assemble the compile-time options of the OpenCL kernel
-  std::set<std::string> options;
-  if (use_wrap)
-  {
-    options.emplace("-DWRAP");
-  }
-  else
-  {
-    options.emplace("-DSATURATE");
-  }
-  options.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
-  options.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
-  options.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
-
-  // Build the kernel and hand the window over to the base class
-  _kernel = static_cast<cl::Kernel>(
-    CLKernelLibraryEx::get().create_kernel("arithmetic_sub_ex", options));
-
-  ICLKernel::configure_internal(window_result.second);
-}
-
-// Static validation entry point: runs the argument checks and then a dry-run of the window
-// configuration on cloned tensor infos, so the caller's infos are left untouched.
-//
-// Returns the first error encountered, or an empty Status.
-Status CLArithmeticSubtractionExKernel::validate(const ITensorInfo *input1,
-                                                 const ITensorInfo *input2,
-                                                 const ITensorInfo *output, ConvertPolicy policy)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-
-  const Status args_status = validate_arguments(input1, input2, output, policy);
-  ARM_COMPUTE_RETURN_ON_ERROR(args_status);
-
-  // Clones absorb the auto-initialisation and padding updates done while computing the window
-  const auto input1_clone = input1->clone();
-  const auto input2_clone = input2->clone();
-  const auto output_clone = output->clone();
-
-  const Status window_status =
-    validate_and_configure_window(input1_clone.get(), input2_clone.get(), output_clone.get())
-      .first;
-  ARM_COMPUTE_RETURN_ON_ERROR(window_status);
-
-  return Status{};
-}
-
-// Enqueues the subtraction kernel on `queue` for every 3D slice of `window`, broadcasting the
-// input windows where an input dimension is <= 1.
-void CLArithmeticSubtractionExKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const TensorShape &in_shape1 = _input1->info()->tensor_shape();
- const TensorShape &in_shape2 = _input2->info()->tensor_shape();
- const TensorShape &out_shape = _output->info()->tensor_shape();
-
- // Collapsing is allowed unconditionally when the smaller input has at most one element;
- // otherwise both inputs must have more than DimZ dimensions and agree on every dimension
- // from DimZ upwards.
- bool can_collapse = true;
- if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
- {
- can_collapse =
- (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
- for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
- {
- can_collapse = (in_shape1[d] == in_shape2[d]);
- }
- }
-
- // has_collapsed is passed to collapse_if_possible() as an out-parameter
- bool has_collapsed = false;
- Window collapsed =
- can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
- : window;
-
- // Use the collapsed input shapes only when the window was actually collapsed
- const TensorShape &in_shape1_collapsed =
- has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
- const TensorShape &in_shape2_collapsed =
- has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
-
- Window slice = collapsed.first_slice_window_3D();
- Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
- Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
-
- do
- {
- unsigned int idx = 0;
-
- add_3D_tensor_argument(idx, _input1, slice_input1);
- add_3D_tensor_argument(idx, _input2, slice_input2);
- add_3D_tensor_argument(idx, _output, slice);
-
- enqueue(queue, *this, slice);
-
- // Advance the input slices; only the output slice decides when the loop ends
- collapsed.slide_window_slice_3D(slice_input1);
- collapsed.slide_window_slice_3D(slice_input2);
- } while (collapsed.slide_window_slice_3D(slice));
-}
-
-// Reports the border this kernel needs: the difference between the output's dimension 0 and
-// the smaller of the two inputs' dimension 0, capped at
-// (num_elems_processed_per_iteration - 1), passed as the second BorderSize argument; the other
-// three are zero.
-BorderSize CLArithmeticSubtractionExKernel::border_size() const
-{
-  const auto output_dim0 = _output->info()->dimension(0);
-  const auto smallest_input_dim0 =
-    std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
-
-  // Extent of dimension 0 that exists in the output but not in the narrower input
-  const unsigned int replicate_extent = output_dim0 - smallest_input_dim0;
-
-  const unsigned int max_border = num_elems_processed_per_iteration - 1U;
-  const unsigned int border = std::min<unsigned int>(max_border, replicate_extent);
-
-  return BorderSize(0, border, 0, 0);
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp
deleted file mode 100644
index b0016d23c..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-// Validates the tensor metadata and block sizes for the batch-to-space kernel.
-//
-// input, output : checked to be single-channel U8, QASYMM8, S16, S32, F16 or F32.
-// block_size    : array of at least two entries; block_size[0] is applied to dimension 1 and
-//                 block_size[1] to dimension 0 of the tensors.
-//
-// Every ARM_COMPUTE_RETURN_ERROR_ON_MSG reports an error when its condition is TRUE, so each
-// condition below describes the *invalid* configuration.
-//
-// Returns an empty Status on success, or an error describing the first failing check.
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
-                          const int32_t *block_size)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
-                                                       DataType::S16, DataType::S32,
-                                                       DataType::F16, DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
-                                                       DataType::S16, DataType::S32,
-                                                       DataType::F16, DataType::F32);
-
-  // Checked first: the modulo operations below rely on both block sizes being positive
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size[0] < 1 || block_size[1] < 1,
-                                  "Block size should be greater than or equal to 1.");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != output->dimension(2),
-                                  "Input Depth should be equal to Output Depth");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-    output->dimension(3) * block_size[0] * block_size[1] != input->dimension(3),
-    "Input batch should be equal to (output batch * block size[0] *block size[1])");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) % block_size[1]) != 0 ||
-                                    (output->dimension(1) % block_size[0]) != 0,
-                                  "Output height and width should be divisible by block size[0] "
-                                  "and block_size[1] respectively");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != input->dimension(0) * block_size[1]) ||
-                                    (output->dimension(1) != input->dimension(1) * block_size[0]),
-                                  "Output height and width should be equal to "
-                                  "input_height*blocksize[0] and input_width*blocksize[1] "
-                                  "respectively");
-
-  return Status{};
-}
-
-} // namespace
-
-CLBatchToSpaceNDKernel::CLBatchToSpaceNDKernel() : _input(nullptr), _output(nullptr) {}
-
-// Configures the kernel: validates the arguments, builds the "batch_to_space_nd" OpenCL kernel
-// and sets the execution window over the whole output tensor.
-//
-// input      : source tensor; must not be nullptr.
-// output     : destination tensor; must not be nullptr. Its valid region is set to its full
-//              shape.
-// block_size : array whose first two entries become the BLOCK_SIZE0 / BLOCK_SIZE1 build options.
-void CLBatchToSpaceNDKernel::configure(const ICLTensor *input, ICLTensor *output,
-                                       const int32_t *block_size)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size));
-
-  _input = input;
-  _output = output;
-
-  // Compile-time options of the OpenCL kernel
-  const std::string data_type_opt =
-    "-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type());
-  const std::string block0_opt = "-DBLOCK_SIZE0=" + support::cpp11::to_string(block_size[0]);
-  const std::string block1_opt = "-DBLOCK_SIZE1=" + support::cpp11::to_string(block_size[1]);
-  const std::string batch_opt =
-    "-DBATCH_OUT=" + support::cpp11::to_string(output->info()->dimension(3));
-  const std::string depth_opt =
-    "-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2));
-
-  std::set<std::string> options;
-  options.emplace(data_type_opt);
-  options.emplace(block0_opt);
-  options.emplace(block1_opt);
-  options.emplace(batch_opt);
-  options.emplace(depth_opt);
-
-  // Build the kernel
-  _kernel = static_cast<cl::Kernel>(
-    CLKernelLibraryEx::get().create_kernel("batch_to_space_nd", options));
-
-  // The execution window spans the whole output
-  Window full_window = calculate_max_window(*output->info(), Steps());
-
-  // Mark the complete output shape, starting at the origin, as valid
-  Coordinates origin;
-  origin.set_num_dimensions(output->info()->num_dimensions());
-  output->info()->set_valid_region(ValidRegion(origin, output->info()->tensor_shape()));
-
-  ICLKernel::configure_internal(full_window);
-}
-
-// Enqueues the batch-to-space kernel on `queue` for the slices of `window`.
-//
-// NOTE(review): the local names look swapped relative to their use - `slice_in` (taken from
-// the execution window) is bound to the output tensor and used for the enqueue, while
-// `slice_out` (all four leading dimensions set to (0, 0, 0)) is bound to the input tensor.
-void CLBatchToSpaceNDKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
- Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
- // Setup output slice
- Window slice_out(slice_in);
- slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
- slice_out.set(3, Window::Dimension(0, 0, 0));
-
- do
- {
- // Argument index restarts at 0 for each slice
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, slice_out);
- add_4D_tensor_argument(idx, _output, slice_in);
- enqueue(queue, *this, slice_in);
- } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
deleted file mode 100644
index 3d2f2c702..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
+++ /dev/null
@@ -1,173 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate_parameters(const ITensorInfo *input1, const ITensorInfo *input2,
- const ITensorInfo *output)
-{
- const TensorShape &out_shape =
- TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
-
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
- "Inputs are not broadcast compatible");
- // Validate in case of configured output
- if (output->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8,
- DataType::QASYMM8);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
- "Wrong shape for output");
- }
- return Status{};
-}
-} // namespace
-
-CLBinaryLogicalOpKernel::CLBinaryLogicalOpKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void CLBinaryLogicalOpKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
- ICLTensor *output, BinaryLogicalOperation op)
-{
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_parameters(input1->info(), input2->info(), output->info()));
-
- _input1 = input1;
- _input2 = input2;
- _output = output;
-
- // Create kernel
- std::string kernel_name = "binary_logical_op";
- std::set<std::string> build_opts;
- build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())));
-
- int op_code = 0;
- switch (op)
- {
- case BinaryLogicalOperation::AND:
- op_code = 1;
- break;
- case BinaryLogicalOperation::OR:
- op_code = 2;
- break;
- default:
- throw std::runtime_error("Operation not supported, yet");
- }
-
- build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code)));
- build_opts.emplace(
- ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-
- _kernel =
- static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
- const std::pair<TensorShape, ValidRegion> broadcast_pair =
- ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
-
- const TensorShape &out_shape = broadcast_pair.first;
- const ValidRegion &valid_region = broadcast_pair.second;
-
- Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
- Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info());
- Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info());
-
- AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- bool window_changed = update_window_and_padding(win_input1, input1_access) ||
- update_window_and_padding(win_input2, input2_access) ||
- update_window_and_padding(win, output_access);
-
- output_access.set_valid_region(win, valid_region);
-
- ICLKernel::configure_internal(win);
-}
-
-void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const TensorShape &in_shape1 = _input1->info()->tensor_shape();
- const TensorShape &in_shape2 = _input2->info()->tensor_shape();
- const TensorShape &out_shape = _output->info()->tensor_shape();
-
- bool can_collapse = true;
- if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
- {
- can_collapse =
- (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
- for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
- {
- can_collapse = (in_shape1[d] == in_shape2[d]);
- }
- }
-
- bool has_collapsed = false;
- Window collapsed =
- can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
- : window;
-
- const TensorShape &in_shape1_collapsed =
- has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
- const TensorShape &in_shape2_collapsed =
- has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
-
- Window slice = collapsed.first_slice_window_3D();
- Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
- Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input1, slice_input1);
- add_3D_tensor_argument(idx, _input2, slice_input2);
- add_3D_tensor_argument(idx, _output, slice);
-
- enqueue(queue, *this, slice);
-
- collapsed.slide_window_slice_3D(slice_input1);
- collapsed.slide_window_slice_3D(slice_input2);
- } while (collapsed.slide_window_slice_3D(slice));
-}
-
-BorderSize CLBinaryLogicalOpKernel::border_size() const
-{
- const unsigned int replicateSize =
- _output->info()->dimension(0) -
- std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
- const unsigned int border =
- std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
- return BorderSize(0, border, 0, 0);
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
deleted file mode 100644
index bf7ebae3f..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLCastKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-CLCastKernel::CLCastKernel() : _input(nullptr), _output(nullptr) {}
-
-void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
- DataType::S16, DataType::S32, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
- DataType::S16, DataType::S32, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-
- _input = input;
- _output = output;
-
- constexpr unsigned int num_elems_processed_per_iteration = 16;
-
- // Set kernel build options
- std::set<std::string> build_opts;
- build_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
- build_opts.emplace(
- ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-
- // Create kernel
- if (is_data_type_quantized_asymmetric(input->info()->data_type()))
- {
- const float scale_in = input->info()->quantization_info().scale;
- const int offset_in = input->info()->quantization_info().offset;
- build_opts.emplace("-DSCALE=" + float_to_string_with_full_precision(scale_in));
- build_opts.emplace("-DOFFSET=" + support::cpp11::to_string(offset_in));
-
- _kernel = static_cast<cl::Kernel>(
- CLKernelLibraryEx::get().create_kernel("cast_qasymm_in", build_opts));
- }
- else if (is_data_type_quantized_asymmetric(output->info()->data_type()))
- {
- const float scale_in = output->info()->quantization_info().scale;
- const int offset_in = output->info()->quantization_info().offset;
- build_opts.emplace("-DSCALE=" + float_to_string_with_full_precision(scale_in));
- build_opts.emplace("-DOFFSET=" + support::cpp11::to_string(offset_in));
-
- _kernel = static_cast<cl::Kernel>(
- CLKernelLibraryEx::get().create_kernel("cast_qasymm_out", build_opts));
- }
- else
- {
- _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("cast", build_opts));
- }
-
- // Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, input->info()->valid_region());
-
- ICLKernel::configure_internal(win);
-}
-
-void CLCastKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_3D();
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice);
- } while (collapsed.slide_window_slice_3D(slice));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp
deleted file mode 100644
index 5af5b16ea..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp
+++ /dev/null
@@ -1,212 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLComparisonOpKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
- const ITensorInfo *output)
-{
- const TensorShape &out_shape =
- TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
-
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::U16,
- DataType::S16, DataType::F16, DataType::S32,
- DataType::F32, DataType::QASYMM8);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::U16,
- DataType::S16, DataType::F16, DataType::S32,
- DataType::F32, DataType::QASYMM8);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
- "Inputs are not broadcast compatible");
- // Validate in case of configured output
- if (output->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
- "Wrong shape for output");
- }
- return Status{};
-}
-} // namespace
-
-CLComparisonOpKernel::CLComparisonOpKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void CLComparisonOpKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
- ICLTensor *output, const ComparisonOperation &op)
-{
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info()));
-
- _input1 = input1;
- _input2 = input2;
- _output = output;
-
- // Create kernel
- std::string kernel_name = "comparison_op";
- int op_code = 0;
-
- switch (op)
- {
- case ComparisonOperation::EQUAL:
- op_code = 1;
- break;
- case ComparisonOperation::NOT_EQUAL:
- op_code = 2;
- break;
- default:
- throw std::runtime_error(" Operation not supported, yet");
- }
-
- std::set<std::string> build_opts;
- build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code)));
- build_opts.emplace(("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input1->info()->data_type())));
- build_opts.emplace(
- ("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())));
- build_opts.emplace(
- ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-
- if (is_data_type_quantized_asymmetric(input1->info()->data_type()) &&
- ((input1->info()->quantization_info().offset != input2->info()->quantization_info().offset) ||
- (input1->info()->quantization_info().scale != input2->info()->quantization_info().scale)))
- {
- build_opts.emplace("-DOFFSET_IN1=" +
- support::cpp11::to_string(input1->info()->quantization_info().offset));
- build_opts.emplace("-DOFFSET_IN2=" +
- support::cpp11::to_string(input2->info()->quantization_info().offset));
- build_opts.emplace("-DSCALE_IN1=" +
- support::cpp11::to_string(input1->info()->quantization_info().scale));
- build_opts.emplace("-DSCALE_IN2=" +
- support::cpp11::to_string(input2->info()->quantization_info().scale));
- kernel_name += "_qasymm8";
- }
-
- _kernel =
- static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
- const std::pair<TensorShape, ValidRegion> broadcast_pair =
- ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
-
- const TensorShape &out_shape = broadcast_pair.first;
- const ValidRegion &valid_region = broadcast_pair.second;
-
- // Auto initialize output if not initialized
- {
- set_shape_if_empty(*output->info(), out_shape);
-
- if (input1->info()->data_type() == DataType::S16 ||
- input2->info()->data_type() == DataType::S16)
- {
- set_format_if_unknown(*output->info(), Format::S16);
- }
- else if (input1->info()->data_type() == DataType::F16 &&
- input2->info()->data_type() == DataType::F16)
- {
- set_format_if_unknown(*output->info(), Format::F16);
- }
- else if (input1->info()->data_type() == DataType::F32 ||
- input2->info()->data_type() == DataType::F32)
- {
- set_format_if_unknown(*output->info(), Format::F32);
- }
- }
-
- Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
- Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info());
- Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info());
-
- AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- bool window_changed = update_window_and_padding(win_input1, input1_access) ||
- update_window_and_padding(win_input2, input2_access) ||
- update_window_and_padding(win, output_access);
-
- output_access.set_valid_region(win, valid_region);
-
- ICLKernel::configure_internal(win);
-}
-
-void CLComparisonOpKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const TensorShape &in_shape1 = _input1->info()->tensor_shape();
- const TensorShape &in_shape2 = _input2->info()->tensor_shape();
- const TensorShape &out_shape = _output->info()->tensor_shape();
-
- bool can_collapse = true;
- if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
- {
- can_collapse =
- (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
- for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
- {
- can_collapse = (in_shape1[d] == in_shape2[d]);
- }
- }
-
- bool has_collapsed = false;
- Window collapsed =
- can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
- : window;
-
- const TensorShape &in_shape1_collapsed =
- has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
- const TensorShape &in_shape2_collapsed =
- has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
-
- Window slice = collapsed.first_slice_window_3D();
- Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
- Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input1, slice_input1);
- add_3D_tensor_argument(idx, _input2, slice_input2);
- add_3D_tensor_argument(idx, _output, slice);
-
- enqueue(queue, *this, slice);
-
- collapsed.slide_window_slice_3D(slice_input1);
- collapsed.slide_window_slice_3D(slice_input2);
- } while (collapsed.slide_window_slice_3D(slice));
-}
-
-BorderSize CLComparisonOpKernel::border_size() const
-{
- const unsigned int replicateSize =
- _output->info()->dimension(0) -
- std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
- const unsigned int border =
- std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
- return BorderSize(0, border, 0, 0);
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
deleted file mode 100644
index c386e3312..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
- const int32_t block_size)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
- DataType::S16, DataType::S32, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
- DataType::S16, DataType::S32, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size >= 1,
- "Block size should be greater than or equal to 1.");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) == input->dimension(0) * block_size,
- "Output width should be equal to (Input width * block size)");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(1) == input->dimension(1) * block_size,
- "Output height should be equal to (Input height * block size)");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) % (block_size * block_size) == 0,
- "Input depth should be divisible by (block size * block size)");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- output->dimension(2) == input->dimension(2) / (block_size * block_size),
- "Output depth should be equal to (Input depth / (block size * block size))");
-
- return Status{};
-}
-} // namespace
-
-CLDepthToSpaceKernel::CLDepthToSpaceKernel() : _input(nullptr), _output(nullptr)
-{
- // DO NOTHING
-}
-
-void CLDepthToSpaceKernel::configure(const ICLTensor *input, ICLTensor *output,
- const int32_t block_size)
-{
-
- _input = input;
- _output = output;
-
- // Set kernel build options
- std::set<std::string> build_opts;
- build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
- build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
-
- // Create kernel
- _kernel =
- static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("depth_to_space", build_opts));
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info(), Steps());
-
- Coordinates coord;
- coord.set_num_dimensions(output->info()->num_dimensions());
- output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-
- ICLKernel::configure_internal(win);
-}
-
-void CLDepthToSpaceKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
- Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
- // Setup input slice
- Window slice_in(slice_out);
- slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
- slice_in.set(3, Window::Dimension(0, 0, 0));
-
- do
- {
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, slice_in);
- add_4D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice_out);
- } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
deleted file mode 100644
index 0862b78bf..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
- Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
- bool window_changed = update_window_and_padding(win, input_access, output_access);
- input_access.set_valid_region(win, output->valid_region());
-
- Status err = (window_changed)
- ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
- : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-CLEmbeddingLookupKernel::CLEmbeddingLookupKernel()
- : _input(nullptr), _output(nullptr), _lookups(nullptr)
-{
-}
-
-Status CLEmbeddingLookupKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *lookups)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
- input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
- DataType::U32, DataType::S32, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
- ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4);
- ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1);
-
- return Status{};
-}
-
-void CLEmbeddingLookupKernel::configure(const ICLTensor *input, ICLTensor *output,
- const ICLTensor *lookups)
-{
- ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info()));
-
- _input = input;
- _output = output;
- _lookups = lookups;
-
- // Set kernel build options
- std::stringstream kernel_name;
- std::set<std::string> build_opts;
- kernel_name << "embedding_lookup";
-
- build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
- build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions()));
-
- // Create kernel
- _kernel = static_cast<cl::Kernel>(
- CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts));
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-}
-
-void CLEmbeddingLookupKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
- Window win_lookup;
- win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0));
-
- do
- {
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, slice_in);
- add_4D_tensor_argument(idx, _output, slice_in);
- add_1D_tensor_argument(idx, _lookups, win_lookup);
-
- enqueue(queue, *this, slice_in);
- } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_1D(win_lookup));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp
deleted file mode 100644
index b1ee21bdc..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLExpKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-CLExpKernel::CLExpKernel() : _input(nullptr), _output(nullptr) {}
-
-void CLExpKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Auto initialize output
- auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(),
- input->info()->quantization_info());
-
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
- _input = input;
- _output = output;
-
- constexpr unsigned int num_elems_processed_per_iteration = 4;
-
- // Create kernel
- std::set<std::string> build_opts;
- build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
- build_opts.emplace(
- ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
- _kernel =
- static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("exp_layer", build_opts));
-
- // Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, input->info()->valid_region());
-
- ICLKernel::configure_internal(win);
-}
-
-void CLExpKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_3D();
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice);
- } while (collapsed.slide_window_slice_3D(slice));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp
deleted file mode 100644
index ae2801e2b..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLGatherKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 1;
-
-// Checks the tensor infos handed to the gather kernel.
-//
-// input1 and output must be single-channel U8, S32 or F32 and share the same data type;
-// input2 must be single-channel S32. Every failed check is reported through the returned
-// Status so that callers of validate() receive an error instead of hitting an assertion.
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
-                          const ITensorInfo *output)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S32,
-                                                       DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S32);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S32,
-                                                       DataType::F32);
-  // Use the RETURN variant: the asserting macro does not produce a Status.
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
-
-  return Status{};
-}
-
-} // namespace
-
-CLGatherKernel::CLGatherKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr) {}
-
-// Stores the tensors, picks the OpenCL gather kernel variant and sets up the execution window.
-//
-// The kernel variant depends on the rank of input1 (and, for rank 2, on the rank of output).
-// The execution window is computed from input2.
-void CLGatherKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info()));
-
-  _input1 = input1;
-  _input2 = input2;
-  _output = output;
-
-  // Select the kernel name from the tensor ranks
-  const auto input_rank = input1->info()->num_dimensions();
-  std::string kernel_name = "gather";
-  if (input_rank == 1)
-  {
-    kernel_name = "gather_1d";
-  }
-  else if (input_rank == 2 && _output->info()->num_dimensions() == 1)
-  {
-    kernel_name = "gather_1d_out";
-  }
-
-  // Pass the OpenCL element type of each tensor as a build option
-  const std::string in1_type = get_cl_type_from_data_type(input1->info()->data_type());
-  const std::string in2_type = get_cl_type_from_data_type(input2->info()->data_type());
-  const std::string out_type = get_cl_type_from_data_type(output->info()->data_type());
-
-  std::set<std::string> build_opts;
-  build_opts.emplace("-DDATA_TYPE_IN1=" + in1_type);
-  build_opts.emplace("-DDATA_TYPE_IN2=" + in2_type);
-  build_opts.emplace("-DDATA_TYPE_OUT=" + out_type);
-
-  // Create kernel
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
-  // Configure kernel window; the whole output shape is marked as valid
-  Window win = calculate_max_window(*input2->info(), Steps(num_elems_processed_per_iteration));
-  const ValidRegion full_output_region(Coordinates(), output->info()->tensor_shape());
-  output->info()->set_valid_region(full_output_region);
-
-  ICLKernel::configure_internal(win);
-}
-
-// Static validation entry point for the gather kernel.
-//
-// Returns an error Status when any tensor info is null or when validate_arguments() rejects
-// the combination of data types; returns an empty Status otherwise.
-Status CLGatherKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2,
-                                const ITensorInfo *output)
-{
-  // Report null pointers through the Status instead of asserting.
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output));
-
-  return Status{};
-}
-
-// Binds the tensors as kernel arguments and enqueues the gather kernel.
-//
-// Only rank-1 and rank-2 input1 tensors are handled here; for any other rank nothing is
-// enqueued.
-void CLGatherKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  const auto input_rank = _input1->info()->num_dimensions();
-
-  if (input_rank == 1)
-  {
-    Window slice_1d = window.first_slice_window_1D();
-
-    unsigned int arg_index = 0;
-    add_1D_tensor_argument(arg_index, _input1, slice_1d);
-    add_1D_tensor_argument(arg_index, _input2, slice_1d);
-    add_1D_tensor_argument(arg_index, _output, slice_1d);
-    enqueue(queue, *this, slice_1d);
-    return;
-  }
-
-  if (input_rank != 2)
-  {
-    return;
-  }
-
-  Window window_from_y = window.collapse_if_possible(ICLKernel::window(), Window::DimY);
-  Window window_from_x = window.collapse_if_possible(ICLKernel::window(), Window::DimX);
-
-  // Set inputs
-  unsigned int arg_index = 0;
-  add_2D_tensor_argument(arg_index, _input1, window_from_y);
-  add_1D_tensor_argument(arg_index, _input2, window_from_x);
-
-  // The output is bound as 1D or 2D depending on its own rank
-  const bool output_is_1d = (_output->info()->num_dimensions() == 1);
-  if (output_is_1d)
-  {
-    add_1D_tensor_argument(arg_index, _output, window_from_x);
-  }
-  else
-  {
-    add_2D_tensor_argument(arg_index, _output, window_from_y);
-  }
-  enqueue(queue, *this, window_from_x);
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
deleted file mode 100644
index cd7b21c6d..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-// Computes the execution window from `output` and updates the padding of both tensors.
-//
-// Returns the window together with a Status that carries a runtime error when
-// update_window_and_padding() reports a change.
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
-  Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-
-  AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
-  const bool padding_changed = update_window_and_padding(win, input_access, output_access);
-  input_access.set_valid_region(win, output->valid_region());
-
-  Status status{};
-  if (padding_changed)
-  {
-    status = ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!");
-  }
-  return std::make_pair(status, win);
-}
-} // namespace
-
-CLHashtableLookupKernel::CLHashtableLookupKernel()
- : _input(nullptr), _output(nullptr), _lookups(nullptr)
-{
-}
-
-// Static validation of the tensor infos used by the hashtable lookup kernel.
-//
-// Requirements checked:
-//  - no tensor info is null
-//  - input is a supported single-channel type and output has the same data type
-//  - lookups and keys are single-channel S32, hits is single-channel U8/QASYMM8
-//  - the output shape has been set
-//  - lookups and hits have the same length and the last output dimension equals that length
-//  - input has between 2 and 4 dimensions
-//  - lookups, keys and hits are at most 1-D
-//
-// Every failure is reported through the returned Status.
-Status CLHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITensorInfo *keys,
-                                         const ITensorInfo *input, const ITensorInfo *output,
-                                         const ITensorInfo *hits)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lookups, keys, input, output, hits);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
-      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
-      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8);
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
-                                  "Output's shape was not set");
-
-  // It is an error when the sizes DISAGREE (the previous condition fired when they matched).
-  ARM_COMPUTE_RETURN_ERROR_ON(lookups->dimension(0) != hits->dimension(0) ||
-                              output->dimension(output->num_dimensions() - 1) !=
-                                  lookups->dimension(0));
-  // "< 2 && > 4" can never be true; the rank is invalid when it is outside [2, 4].
-  ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() < 2 || input->num_dimensions() > 4);
-  ARM_COMPUTE_RETURN_ERROR_ON(lookups->num_dimensions() > 1);
-  ARM_COMPUTE_RETURN_ERROR_ON(keys->num_dimensions() > 1);
-  ARM_COMPUTE_RETURN_ERROR_ON(hits->num_dimensions() > 1);
-
-  return Status{};
-}
-
-// Stores the tensors, allocates the internal lookup-index tensor, builds the OpenCL kernel
-// and configures the execution window.
-//
-// lookups / keys : tensors read on the host in run() to resolve each lookup to a key index
-// input          : tensor the kernel reads from
-// output         : tensor the kernel writes to
-// hits           : tensor that run() fills with 1 (key found) or 0 (key missing) per lookup
-void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTensor *keys,
-                                        const ICLTensor *input, ICLTensor *output, ICLTensor *hits)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits);
-  // validate() expects (lookups, keys, input, output, hits) in exactly this order.
-  ARM_COMPUTE_ERROR_THROW_ON(validate(lookups->info(), keys->info(), input->info(),
-                                      output->info(), hits->info()));
-
-  _lookups = lookups;
-  _keys = keys;
-  _input = input;
-  _output = output;
-  _hits = hits;
-
-  // Make _lookup_indices tensor: same shape and channel count as lookups, S32 elements
-  _lookup_indices = arm_compute::support::cpp14::make_unique<CLTensor>();
-  _lookup_indices->allocator()->init(
-      TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32));
-  _lookup_indices->allocator()->allocate();
-
-  // Set kernel build options
-  const std::string kernel_name = "hashtable_lookup";
-  std::set<std::string> build_opts;
-
-  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
-  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-  build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-  build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions()));
-
-  // Create kernel
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
-  // Configure kernel window
-  auto win_config = validate_and_configure_window(input->info(), output->info());
-  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-  ICLKernel::configure_internal(win_config.second);
-}
-
-// Resolves every lookup value to a key index on the host, then enqueues the lookup kernel.
-//
-// Host step: lookups, keys, hits and the internal index tensor are mapped; for each lookup the
-// matching key index (or -1) is written to the index tensor and hits is set to 1 or 0.
-// Device step: input, output and the index tensor are bound and the kernel is enqueued.
-void CLHashtableLookupKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  const_cast<ICLTensor *>(_lookups)->map(queue);
-  const_cast<ICLTensor *>(_keys)->map(queue);
-  _hits->map(queue);
-  _lookup_indices->map(queue);
-
-  // Set values of hits
-  const int32_t *lookups_buf =
-      reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_lookups)->buffer());
-  const int32_t *keys_buf = reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_keys)->buffer());
-  uint8_t *hits_buf = reinterpret_cast<uint8_t *>(_hits->buffer());
-  int32_t *lookup_indices_buf = reinterpret_cast<int32_t *>(_lookup_indices->buffer());
-
-  // Map each key value to its position; a repeated key keeps the last position seen.
-  std::map<int32_t, size_t> key_map;
-  const size_t keys_num = _keys->info()->dimension(0);
-  for (size_t key_index = 0; key_index < keys_num; key_index++)
-  {
-    key_map[keys_buf[key_index]] = key_index;
-  }
-
-#if defined(DEBUG)
-  // The resolved index selects an entry of the input tensor, so it is bounded by the input's
-  // outermost dimension, not by the number of lookups.
-  // NOTE(review): assumes the outermost input dimension is the per-key dimension - confirm
-  // against the hashtable_lookup OpenCL kernel.
-  const size_t input_rows = _input->info()->dimension(_input->info()->num_dimensions() - 1);
-#endif // defined(DEBUG)
-
-  const size_t lookups_num = _lookups->info()->dimension(0);
-  for (size_t i = 0; i < lookups_num; ++i)
-  {
-    const auto lookup_value = lookups_buf[i];
-    const auto it = key_map.find(lookup_value);
-    if (it != key_map.end())
-    {
-#if defined(DEBUG)
-      if (it->second >= input_rows)
-        ARM_COMPUTE_ERROR("HashTable Lookup: index out of bounds.");
-#endif // defined(DEBUG)
-      lookup_indices_buf[i] = static_cast<int32_t>(it->second);
-      hits_buf[i] = static_cast<uint8_t>(1);
-    }
-    else
-    {
-      // -1 marks a lookup whose key is not present
-      lookup_indices_buf[i] = -1;
-      hits_buf[i] = static_cast<uint8_t>(0);
-    }
-  }
-
-  const_cast<ICLTensor *>(_lookups)->unmap(queue);
-  const_cast<ICLTensor *>(_keys)->unmap(queue);
-  _hits->unmap(queue);
-  _lookup_indices->unmap(queue);
-
-  Window win = window.collapse(ICLKernel::window(), 2, 4);
-
-  Window win_lookup;
-  win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0));
-
-  do
-  {
-    unsigned int idx = 0;
-    add_4D_tensor_argument(idx, _input, win);
-    add_4D_tensor_argument(idx, _output, win);
-    add_1D_tensor_argument(idx, _lookup_indices.get(), win_lookup);
-
-    enqueue(queue, *this, win);
-  } while (window.slide_window_slice_4D(win) && window.slide_window_slice_1D(win_lookup));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
deleted file mode 100644
index 80d99dd3b..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLNegKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-// Checks the tensor infos handed to the negation kernel.
-//
-// input and output must be single-channel S16, S32, F16 or F32, and must agree in both shape
-// and data type. Every failed check is reported through the returned Status.
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16, DataType::S32,
-                                                       DataType::F16, DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16, DataType::S32,
-                                                       DataType::F16, DataType::F32);
-  // The arguments are already ITensorInfo pointers, so the shapes are compared directly
-  // (the previous code called ->info() on them).
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-  return Status{};
-}
-
-} // namespace
-
-CLNegKernel::CLNegKernel() : _input(nullptr), _output(nullptr) {}
-
-// Stores the tensors, builds the "neg_tensor" OpenCL kernel and configures the execution
-// window, processing 16 elements per iteration.
-void CLNegKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
-
-  _input = input;
-  _output = output;
-
-  constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-  // Create kernel
-  const std::string data_type_opt =
-      "-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type());
-  const std::string vec_size_opt =
-      "-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration);
-
-  std::set<std::string> build_opts;
-  build_opts.emplace(data_type_opt);
-  build_opts.emplace(vec_size_opt);
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("neg_tensor", build_opts));
-
-  // Configure window
-  Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-
-  AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-  update_window_and_padding(win, input_access, output_access);
-
-  // The output's valid region is derived from the input's valid region
-  output_access.set_valid_region(win, input->info()->valid_region());
-
-  ICLKernel::configure_internal(win);
-}
-
-// Enqueues the negation kernel once for every 3D slice of the execution window.
-void CLNegKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-  // Collapse the window starting at DimZ when possible, then iterate over its 3D slices.
-  Window merged_window = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-  Window current_slice = merged_window.first_slice_window_3D();
-
-  bool has_next_slice = true;
-  while (has_next_slice)
-  {
-    // Kernel arguments are bound in order: input first, then output.
-    unsigned int arg_index = 0;
-    add_3D_tensor_argument(arg_index, _input, current_slice);
-    add_3D_tensor_argument(arg_index, _output, current_slice);
-    enqueue(queue, *this, current_slice, lws_hint());
-
-    has_next_slice = merged_window.slide_window_slice_3D(current_slice);
-  }
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp
deleted file mode 100644
index 12bbe910f..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-// Checks the tensor infos handed to the normalization kernel.
-//
-// input must be single-channel F16 or F32 and output must not be null. When output already
-// has a non-zero total size it must match input in data type and shape. norm_info is not
-// inspected here.
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
-                          NormalizationLayerInfo norm_info)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
-
-  // Checks performed when output is configured
-  const bool output_is_configured = (output->total_size() != 0);
-  if (output_is_configured)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-  }
-
-  return Status{};
-}
-
-// Initializes output from input when it is still empty, computes the execution window and
-// updates the tensors' padding.
-//
-// Returns the window together with a Status that carries a runtime error when
-// update_window_and_padding() reports a change.
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output,
-                                                        NormalizationLayerInfo norm_info)
-{
-  // Output tensor auto initialization if not yet initialized
-  auto_init_if_empty(*output, *input->clone());
-
-  const unsigned int norm_size = norm_info.norm_size();
-  const bool in_map = norm_info.is_in_map();
-
-  // In-map normalization reads neighbours along X: the border is half the norm size, capped at 3
-  const unsigned int border_width = in_map ? std::min(norm_size / 2, 3U) : 0;
-  const BorderSize border = BorderSize(0, border_width);
-
-  const unsigned int num_elems_processed_per_iteration = 4;
-  unsigned int num_elems_read_per_iteration = num_elems_processed_per_iteration;
-  if (in_map)
-  {
-    num_elems_read_per_iteration = num_elems_processed_per_iteration + 2 * (norm_size / 2);
-  }
-
-  Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-
-  // We do not use a Rectangle window for IN_MAP_2D as we clamp the top and bottom accesses inside
-  // the kernel, avoiding padding
-  AccessWindowHorizontal input_access(input, -border.left, num_elems_read_per_iteration);
-  AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
-  const bool padding_changed = update_window_and_padding(win, input_access, output_access);
-
-  output_access.set_valid_region(win, input->valid_region());
-
-  Status status{};
-  if (padding_changed)
-  {
-    status = ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!");
-  }
-  return std::make_pair(status, win);
-}
-} // namespace
-
-// Default constructor: null tensors, a zero border and cross-map mode (_is_in_map == false).
-CLNormalizationLayerExKernel::CLNormalizationLayerExKernel()
-    : _input(nullptr),
-      _output(nullptr),
-      _border_size(0),
-      _is_in_map(false)
-{
-}
-
-// Returns the border size stored in this kernel.
-BorderSize CLNormalizationLayerExKernel::border_size() const
-{
-  return _border_size;
-}
-
-// Stores the tensors, derives the normalization mode and border from norm_info, builds the
-// OpenCL kernel and configures the execution window.
-//
-// input     : source tensor (validated as single-channel F16/F32)
-// output    : destination tensor; auto-initialized from input when still empty
-// norm_info : normalization type, size and the coefficients passed as build options
-void CLNormalizationLayerExKernel::configure(const ICLTensor *input, ICLTensor *output,
-                                             NormalizationLayerInfo norm_info)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-  // Output tensor auto initialization if not yet initialized
-  auto_init_if_empty(*output->info(), *input->info()->clone());
-
-  // Perform validation step
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), norm_info));
-
-  _input = input;
-  _output = output;
-
-  // Record the mode and border. Without this _is_in_map keeps its constructor value (false),
-  // so the cross-map kernel would always be selected and border_size() would always be 0.
-  // The border formula mirrors the one used by validate_and_configure_window().
-  _is_in_map = norm_info.is_in_map();
-  const unsigned int border_width = _is_in_map ? std::min(norm_info.norm_size() / 2, 3U) : 0;
-  _border_size = BorderSize(0, border_width);
-
-  const unsigned int num_elems_processed_per_iteration = 4;
-  const bool is_in_map_2D = (norm_info.type() == NormType::IN_MAP_2D);
-
-  // Set build options
-  CLBuildOptions build_opts;
-  build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-  build_opts.add_option(
-      ("-DCOEFF=" + float_to_string_with_full_precision(norm_info.scale_coeff())));
-  build_opts.add_option(("-DBETA=" + float_to_string_with_full_precision(norm_info.beta())));
-  build_opts.add_option(("-DKAPPA=" + float_to_string_with_full_precision(norm_info.kappa())));
-  build_opts.add_option(
-      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-  build_opts.add_option(("-DRADIUS=" + support::cpp11::to_string(norm_info.norm_size())));
-  build_opts.add_option(("-DNUM_SLICES=" + support::cpp11::to_string(input->info()->dimension(2))));
-  build_opts.add_option_if(is_in_map_2D, "-DIN_MAP_2D");
-
-  // Create kernel
-  std::string kernel_name =
-      _is_in_map ? "normalization_layer_in_map" : "normalization_layer_cross_map";
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
-
-  // Configure kernel window
-  auto win_config = validate_and_configure_window(input->info(), output->info(), norm_info);
-  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-  ICLKernel::configure_internal(win_config.second);
-
-  // Set config_id for enabling LWS tuning
-  _config_id = "normalization_layer_";
-  _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-  _config_id += "_";
-  _config_id += support::cpp11::to_string(
-      static_cast<std::underlying_type<NormType>::type>(norm_info.type()));
-  _config_id += "_";
-  _config_id += support::cpp11::to_string(norm_info.norm_size());
-  _config_id += "_";
-  _config_id += support::cpp11::to_string(input->info()->dimension(0));
-  _config_id += "_";
-  _config_id += support::cpp11::to_string(input->info()->dimension(1));
-}
-
-// Static validation: checks the arguments, then dry-runs the window configuration on clones
-// of the tensor infos so the caller's infos are left untouched.
-Status CLNormalizationLayerExKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                              NormalizationLayerInfo norm_info)
-{
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, norm_info));
-
-  auto input_clone = input->clone();
-  auto output_clone = output->clone();
-  ARM_COMPUTE_RETURN_ON_ERROR(
-      validate_and_configure_window(input_clone.get(), output_clone.get(), norm_info).first);
-
-  return Status{};
-}
-
-// Enqueues the normalization kernel once for every 3D slice of the execution window.
-void CLNormalizationLayerExKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  // In-map mode collapses from DimZ; otherwise collapsing starts at dimension 4.
-  int first_collapsed_dim = 4;
-  if (_is_in_map)
-  {
-    first_collapsed_dim = Window::DimZ;
-  }
-
-  Window merged_window = window.collapse_if_possible(ICLKernel::window(), first_collapsed_dim);
-  Window current_slice = merged_window.first_slice_window_3D();
-
-  bool has_next_slice = true;
-  while (has_next_slice)
-  {
-    unsigned int arg_index = 0;
-    add_3D_tensor_argument(arg_index, _input, current_slice);
-    add_3D_tensor_argument(arg_index, _output, current_slice);
-    enqueue(queue, *this, current_slice);
-
-    has_next_slice = merged_window.slide_window_slice_3D(current_slice);
-  }
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp
deleted file mode 100644
index 241f8ae4d..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLPReLUKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-// Checks the tensor infos handed to the PReLU kernel.
-//
-// input and alpha must be single-channel F16, F32 or QASYMM8 and their shapes must be
-// broadcast compatible. When output already has a non-zero total size it must be one of the
-// same data types and its shape must equal the broadcast shape of input and alpha.
-Status validate_info(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output)
-{
-  const TensorShape &out_shape =
-      TensorShape::broadcast_shape(input->tensor_shape(), alpha->tensor_shape());
-
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32,
-                                                       DataType::QASYMM8);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(alpha, 1, DataType::F16, DataType::F32,
-                                                       DataType::QASYMM8);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
-                                  "Inputs are not broadcast compatible");
-  // Validate in case of configured output
-  if (output->total_size() > 0)
-  {
-    // QASYMM8 is accepted here as well: configure() requires output to share the input's data
-    // type and builds a dedicated "prelu_qasymm8" kernel with output quantization options.
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32,
-                                                         DataType::QASYMM8);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
-        "Wrong shape for output");
-  }
-  return Status{};
-}
-} // namespace
-
-CLPReLUKernel::CLPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {}
-
-// Stores the tensors, builds the PReLU OpenCL kernel (the "_qasymm8" variant for quantized
-// input) and configures the execution window over the broadcast shape of input and alpha.
-//
-// input  : source tensor
-// alpha  : slope tensor, broadcast against input
-// output : destination tensor; shape and format are auto-initialized when not yet set
-void CLPReLUKernel::configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, alpha, output);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, alpha);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-  // The validation helper defined in this file is validate_info().
-  ARM_COMPUTE_ERROR_THROW_ON(validate_info(input->info(), alpha->info(), output->info()));
-
-  _input = input;
-  _alpha = alpha;
-  _output = output;
-
-  // Create kernel
-  std::string kernel_name = "prelu";
-  std::set<std::string> build_opts;
-  build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-  build_opts.emplace(
-      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-
-  // Quantized input: pass offset and scale of every tensor and switch to the qasymm8 kernel
-  if (is_data_type_quantized_asymmetric(input->info()->data_type()))
-  {
-    build_opts.emplace("-DOFF_IN1=" +
-                       support::cpp11::to_string(input->info()->quantization_info().offset));
-    build_opts.emplace("-DOFF_IN2=" +
-                       support::cpp11::to_string(alpha->info()->quantization_info().offset));
-    build_opts.emplace("-DOFF_OUT=" +
-                       support::cpp11::to_string(output->info()->quantization_info().offset));
-    build_opts.emplace("-DSCALE_IN1=" +
-                       support::cpp11::to_string(input->info()->quantization_info().scale));
-    build_opts.emplace("-DSCALE_IN2=" +
-                       support::cpp11::to_string(alpha->info()->quantization_info().scale));
-    build_opts.emplace("-DSCALE_OUT=" +
-                       support::cpp11::to_string(output->info()->quantization_info().scale));
-    kernel_name += "_qasymm8";
-  }
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
-  const std::pair<TensorShape, ValidRegion> broadcast_pair =
-      ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info());
-
-  const TensorShape &out_shape = broadcast_pair.first;
-  const ValidRegion &valid_region = broadcast_pair.second;
-
-  // Auto initialize output if not initialized
-  {
-    set_shape_if_empty(*output->info(), out_shape);
-
-    if (input->info()->data_type() == DataType::F16 && alpha->info()->data_type() == DataType::F16)
-    {
-      set_format_if_unknown(*output->info(), Format::F16);
-    }
-    else if (input->info()->data_type() == DataType::F32 ||
-             alpha->info()->data_type() == DataType::F32)
-    {
-      set_format_if_unknown(*output->info(), Format::F32);
-    }
-  }
-
-  Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
-  Window win_input1 = win.broadcast_if_dimension_le_one(*input->info());
-  Window win_input2 = win.broadcast_if_dimension_le_one(*alpha->info());
-
-  AccessWindowHorizontal input1_access(input->info(), 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal input2_access(alpha->info(), 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-  // NOTE(review): the result is not checked, and because of the short-circuiting || a later
-  // call is skipped as soon as an earlier one returns true. Kept unchanged here.
-  bool window_changed = update_window_and_padding(win_input1, input1_access) ||
-                        update_window_and_padding(win_input2, input2_access) ||
-                        update_window_and_padding(win, output_access);
-
-  output_access.set_valid_region(win, valid_region);
-
-  ICLKernel::configure_internal(win);
-}
-
-// Enqueues the PReLU kernel once for every 3D slice, broadcasting input and alpha as needed.
-void CLPReLUKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-  const TensorShape &input_shape = _input->info()->tensor_shape();
-  const TensorShape &alpha_shape = _alpha->info()->tensor_shape();
-  const TensorShape &output_shape = _output->info()->tensor_shape();
-
-  // Collapsing is allowed when one operand holds at most one element. Otherwise both operands
-  // need more than DimZ dimensions and identical sizes in every dimension from DimZ upwards.
-  bool collapse_allowed = true;
-  const bool both_have_several_elements =
-      std::min(input_shape.total_size(), alpha_shape.total_size()) > 1;
-  if (both_have_several_elements)
-  {
-    collapse_allowed =
-        (std::min(input_shape.num_dimensions(), alpha_shape.num_dimensions()) > Window::DimZ);
-    for (size_t dim = Window::DimZ; collapse_allowed && (dim < output_shape.num_dimensions());
-         dim++)
-    {
-      collapse_allowed = (input_shape[dim] == alpha_shape[dim]);
-    }
-  }
-
-  bool did_collapse = false;
-  Window exec_window =
-      collapse_allowed
-          ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &did_collapse)
-          : window;
-
-  // Use the collapsed shapes only when the window was actually collapsed
-  const TensorShape input_shape_used =
-      did_collapse ? input_shape.collapsed_from(Window::DimZ) : input_shape;
-  const TensorShape alpha_shape_used =
-      did_collapse ? alpha_shape.collapsed_from(Window::DimZ) : alpha_shape;
-
-  Window output_slice = exec_window.first_slice_window_3D();
-  Window input_slice = output_slice.broadcast_if_dimension_le_one(input_shape_used);
-  Window alpha_slice = output_slice.broadcast_if_dimension_le_one(alpha_shape_used);
-
-  bool has_next_slice = true;
-  while (has_next_slice)
-  {
-    unsigned int arg_index = 0;
-    add_3D_tensor_argument(arg_index, _input, input_slice);
-    add_3D_tensor_argument(arg_index, _alpha, alpha_slice);
-    add_3D_tensor_argument(arg_index, _output, output_slice);
-
-    enqueue(queue, *this, output_slice);
-
-    // Advance the operand slices first, then the output slice, whose result ends the loop
-    exec_window.slide_window_slice_3D(input_slice);
-    exec_window.slide_window_slice_3D(alpha_slice);
-    has_next_slice = exec_window.slide_window_slice_3D(output_slice);
-  }
-}
-
-// Returns the border of this kernel: only the second BorderSize argument is non-zero.
-//
-// That value is the difference between the output width and the narrower of the two operand
-// widths, capped at (elements processed per iteration - 1).
-BorderSize CLPReLUKernel::border_size() const
-{
-  const auto narrowest_operand_width =
-      std::min(_input->info()->dimension(0), _alpha->info()->dimension(0));
-  const unsigned int replicate_size = _output->info()->dimension(0) - narrowest_operand_width;
-
-  const unsigned int max_border = num_elems_processed_per_iteration - 1U;
-  const unsigned int border = std::min<unsigned int>(max_border, replicate_size);
-
-  return BorderSize(0, border, 0, 0);
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp
deleted file mode 100644
index 99b54c822..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLPadLayerKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-/**
- * Checks the tensor metadata handed to CLPadLayerKernel::configure().
- *
- * @param[in] input_info    Info of the tensor to pad. Single channel, one of
- *                          U8/QASYMM8/S16/S32/F16/F32, with 1 to 4 dimensions.
- * @param[in] output_info   Info of the padded tensor. Same set of data types and the same
- *                          number of dimensions as @p input_info.
- * @param[in] pad_size_info Info of the tensor holding the pad amounts. Single channel S32.
- *
- * @return An empty Status when every check passes, otherwise a Status describing the first
- *         failed check.
- */
-Status validate_arguments(const ITensorInfo *input_info, const ITensorInfo *output_info,
-                          const ITensorInfo *pad_size_info)
-{
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_info, 1, DataType::U8, DataType::QASYMM8,
-                                                DataType::S16, DataType::S32, DataType::F16,
-                                                DataType::F32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_info, 1, DataType::U8, DataType::QASYMM8,
-                                                DataType::S16, DataType::S32, DataType::F16,
-                                                DataType::F32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(pad_size_info, 1, DataType::S32);
-
-  // ARM_COMPUTE_RETURN_ERROR_ON_MSG reports the error when its condition is TRUE, so the
-  // conditions below must describe the invalid case (they previously described the valid one).
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_info->num_dimensions() == 0 ||
-                                      input_info->num_dimensions() > 4,
-                                  "Pad kernel supports upto 4-D input tensor");
-
-  // NOTE(review): num_dimensions() may ignore trailing dimensions of size 1 - confirm that
-  // padding such a dimension cannot legitimately change the reported rank.
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-      input_info->num_dimensions() != output_info->num_dimensions(),
-      "output tensor should have same number of dimensions as input tensor");
-
-  if (input_info->data_type() == DataType::QASYMM8)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_info->quantization_info() !=
-                                        output_info->quantization_info(),
-                                    "The input and output quantization info are different!");
-  }
-
-  return Status{};
-}
-
-} // namespace
-
-// Creates an unconfigured kernel; the tensor pointers stay null until configure() sets them.
-CLPadLayerKernel::CLPadLayerKernel()
-    : _input(nullptr), _output(nullptr), _pad_size(nullptr)
-{
-}
-
-/**
- * Binds the tensors, builds the "pad" OpenCL kernel and sets the execution window.
- *
- * @param[in]     input    Tensor to pad.
- * @param[in,out] output   Padded tensor; its valid region is set to its full shape.
- * @param[in]     pad_size Tensor holding the pad amounts (read later, in run()).
- */
-void CLPadLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *pad_size)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, pad_size);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pad_size->info()));
-
-  _input = input;
-  _output = output;
-  _pad_size = pad_size;
-
-  const auto *in_info = input->info();
-  auto *out_info = output->info();
-
-  // Value used to fill the padded area: the quantization offset for QASYMM8, otherwise 0.
-  const bool is_quantized = (in_info->data_type() == DataType::QASYMM8);
-  const std::string zero_value =
-      is_quantized ? support::cpp11::to_string(in_info->quantization_info().offset)
-                   : support::cpp11::to_string(0);
-
-  // Compile-time definitions for the OpenCL program
-  std::set<std::string> options;
-  options.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(in_info->data_type()));
-  options.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(out_info->dimension(2)));
-  options.emplace("-DIW=" + support::cpp11::to_string(in_info->dimension(0)));
-  options.emplace("-DIH=" + support::cpp11::to_string(in_info->dimension(1)));
-  options.emplace("-DID=" + support::cpp11::to_string(in_info->dimension(2)));
-  options.emplace("-DIB=" + support::cpp11::to_string(in_info->dimension(3)));
-  options.emplace("-DZERO_VALUE=" + zero_value);
-
-  // Build the kernel
-  _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("pad", options));
-
-  // The window spans the whole output tensor
-  Window win = calculate_max_window(*out_info, Steps());
-
-  // Mark the complete output shape as valid, anchored at the origin
-  Coordinates origin;
-  origin.set_num_dimensions(out_info->num_dimensions());
-  out_info->set_valid_region(ValidRegion(origin, out_info->tensor_shape()));
-
-  ICLKernel::configure_internal(win);
-}
-/**
- * Reads the leading pad amounts from _pad_size on the host and enqueues the "pad" kernel.
- *
- * @param[in]     window Execution window; must match the window stored by configure().
- * @param[in,out] queue  Command queue used to map/unmap _pad_size and to enqueue the kernel.
- */
-void CLPadLayerKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
- _pad_size->map(queue);
-
- /* Padding values only for up, top, left and front are required based on the rank of tensor.
-  * rank is dimension 1 of _pad_size, and each value below is read from coordinate {0, row},
-  * where row depends on rank:
-  *   rank 4: batch = 0, height = 1, width = 2, depth = 3
-  *   rank 3: depth = 0, height = 1, width = 2
-  *   rank 2: height = 0, width = 1
-  *   rank 1: width = 0
-  * A value whose rank condition is not met is 0. */
- int rank = _pad_size->info()->dimension(1);
-
- auto pad_batch_up =
- (rank == 4) ? *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, 0})) : 0;
- auto pad_height_top =
- (rank >= 2)
- ? *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, (rank == 2) ? 0 : 1}))
- : 0;
- auto pad_width_left = (rank >= 1)
- ? *reinterpret_cast<const int32_t *>(
- _pad_size->ptr_to_element({0, (rank == 4) ? 2 : rank - 1}))
- : 0;
- auto pad_depth_front =
- (rank >= 3)
- ? *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, (rank == 3) ? 0 : 3}))
- : 0;
-
- _pad_size->unmap(queue);
-
- // Pad_values which needs to be passed, packed as (width, height, depth, batch)
- const cl_int4 paddingValues = {
- {static_cast<cl_int>(pad_width_left), static_cast<cl_int>(pad_height_top),
- static_cast<cl_int>(pad_depth_front), static_cast<cl_int>(pad_batch_up)}};
-
- Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
- // Setup input slice: a copy of the output slice with dimensions 0..3 reset to (0, 0, 0)
- Window slice_in(slice_out);
- slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
- slice_in.set(3, Window::Dimension(0, 0, 0));
-
- do
- {
- unsigned int idx = 0; // argument index, advanced by each add_4D_tensor_argument call
- add_4D_tensor_argument(idx, _input, slice_in);
- add_4D_tensor_argument(idx, _output, slice_out);
- _kernel.setArg<cl_int4>(idx++, paddingValues);
- enqueue(queue, *this, slice_out);
- } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp
deleted file mode 100644
index aa094761c..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLPermuteExKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
-using namespace arm_compute;
-
-namespace
-{
-// Returns the input's tensor shape after applying the permutation @p perm to it.
-TensorShape get_output_shape(const ITensorInfo *input, const PermutationVector &perm)
-{
-  TensorShape permuted_shape{input->tensor_shape()};
-  permute(permuted_shape, perm);
-  return permuted_shape;
-}
-
-/**
- * Checks the arguments of the permute kernel.
- *
- * @param[in] input  Input info. Single channel, one of U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
- * @param[in] output Output info. Only checked when already configured (total size != 0); it
- *                   must then have the permuted shape and the input's data type.
- * @param[in] perm   Permutation vector.
- *
- * @return An empty Status on success, otherwise the first error found.
- */
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
-                          const PermutationVector &perm)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
-      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
-      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
-
-  const TensorShape expected_shape =
-      misc::shape_calculator::compute_permutation_output_shape(*input, perm);
-
-  // An output that has not been configured yet has nothing to compare against.
-  const bool output_is_configured = (output->total_size() != 0);
-  if (!output_is_configured)
-  {
-    return Status{};
-  }
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), expected_shape);
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-  return Status{};
-}
-} // namespace
-
-// Creates an unconfigured kernel: null tensor pointers and a value-initialized permutation.
-CLPermuteExKernel::CLPermuteExKernel()
-    : _input(nullptr), _output(nullptr), _perm()
-{
-}
-
-/**
- * Binds the tensors, builds the "permute_generic" OpenCL kernel and sets the execution window.
- *
- * @param[in]     input  Tensor to permute.
- * @param[in,out] output Permuted tensor; auto-initialized from the input's info with the
- *                       permuted shape when it is still empty.
- * @param[in]     perm   Permutation vector; its first four entries are passed to the kernel
- *                       as the P1..P4 build options.
- */
-void CLPermuteExKernel::configure(const ICLTensor *input, ICLTensor *output,
-                                  const PermutationVector &perm)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), perm));
-
-  _input = input;
-  _output = output;
-  _perm = perm;
-
-  // Output auto initialization if not yet initialized
-  const TensorShape permuted_shape = get_output_shape(input->info(), perm);
-  auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(permuted_shape));
-
-  // Compile-time definitions for the OpenCL program
-  std::set<std::string> options;
-  options.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-  options.emplace("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
-
-  // P1..P4 carry the first four entries of the permutation vector
-  for (unsigned int i = 0; i < 4; ++i)
-  {
-    options.emplace("-DP" + support::cpp11::to_string(i + 1) + "=" +
-                    support::cpp11::to_string(perm[i]));
-  }
-
-  // Build the kernel
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("permute_generic", options));
-
-  // The window spans the whole input tensor
-  Window win = calculate_max_window(*input->info(), Steps());
-
-  // No padding is needed, so update_window_and_padding() is skipped and the complete output
-  // shape is marked valid directly
-  Coordinates origin;
-  origin.set_num_dimensions(output->info()->num_dimensions());
-  output->info()->set_valid_region(ValidRegion(origin, output->info()->tensor_shape()));
-
-  ICLKernel::configure_internal(win);
-}
-
-/**
- * Static check of whether configure() would accept the given arguments.
- *
- * @return An error Status when @p input or @p output is null or when validate_arguments()
- *         rejects the arguments; an empty Status otherwise.
- */
-Status CLPermuteExKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                   const PermutationVector &perm)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-
-  // validate_arguments() yields an empty Status on success, so its result is the answer.
-  return validate_arguments(input, output, perm);
-}
-/**
- * Enqueues the permute kernel over the input window.
- *
- * @param[in]     window Execution window; must match the window stored by configure().
- * @param[in,out] queue  Command queue the kernel is enqueued on.
- */
-void CLPermuteExKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
- Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
- // Setup output slice: a copy of the input slice with dimensions 0..3 reset to (0, 0, 0)
- Window slice_out(slice_in);
- slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
- slice_out.set(3, Window::Dimension(0, 0, 0));
-
- do
- {
- unsigned int idx = 0; // argument index, advanced by each add_4D_tensor_argument call
- add_4D_tensor_argument(idx, _input, slice_in);
- add_4D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice_in); // the work size follows the input slice
- } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp
deleted file mode 100644
index b985aa737..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp
+++ /dev/null
@@ -1,280 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16; // X step of the kernel window
-/**
- * Checks the arguments of the pixel-wise division kernel.
- *
- * Both inputs must be single channel U8/S16/F16/F32, @p scale must not be negative and the
- * input shapes must be broadcast compatible. A configured output (total size > 0) must also be
- * single channel U8/S16/F16/F32, may only be U8 when both inputs are U8, and must have the
- * broadcast shape. @p overflow_policy and @p rounding_policy are not checked.
- *
- * @return An empty Status on success, otherwise the first error found.
- */
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
- const ITensorInfo *output, float scale, ConvertPolicy overflow_policy,
- RoundingPolicy rounding_policy)
-{
- ARM_COMPUTE_UNUSED(overflow_policy);
- ARM_COMPUTE_UNUSED(rounding_policy);
-
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16,
- DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16,
- DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative.");
-
- const TensorShape &out_shape =
- TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
- "Inputs are not broadcast compatible");
-
- // Validate in case of configured output
- if (output->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16,
- DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- output->data_type() == DataType::U8 &&
- (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
- "Output can only be U8 if both inputs are U8");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
- "Wrong shape for output");
- }
-
- return Status{};
-}
-/**
- * Auto-initializes the output and computes the execution window.
- *
- * The output shape is set to the broadcast shape when empty. The output format is set, when
- * unknown, to S16 if either input is S16, otherwise to F32 if either input is F32.
- * NOTE(review): no branch sets a format when neither input is S16 nor F32 (e.g. U8 or F16
- * only) - confirm the caller initializes the output in those cases.
- *
- * @return A pair of (Status, Window). The Status is a RUNTIME_ERROR "Insufficient Padding!"
- *         when any update_window_and_padding() call reported a change, otherwise empty.
- */
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2,
- ITensorInfo *output)
-{
- const std::pair<TensorShape, ValidRegion> broadcast_pair =
- ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
- const TensorShape &out_shape = broadcast_pair.first;
- const ValidRegion &valid_region = broadcast_pair.second;
-
- // Auto initialize output if not initialized
- {
- set_shape_if_empty(*output, out_shape);
-
- if (input1->data_type() == DataType::S16 || input2->data_type() == DataType::S16)
- {
- set_format_if_unknown(*output, Format::S16);
- }
- else if (input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32)
- {
- set_format_if_unknown(*output, Format::F32);
- }
- }
-
- Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
- Window win_input1 = win.broadcast_if_dimension_le_one(*input1);
- Window win_input2 = win.broadcast_if_dimension_le_one(*input2);
-
- AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
- /* NOTE(review): || short-circuits, so once one call returns true the remaining
-  * update_window_and_padding() calls are not executed. */
- bool window_changed = update_window_and_padding(win_input1, input1_access) ||
- update_window_and_padding(win_input2, input2_access) ||
- update_window_and_padding(win, output_access);
-
- output_access.set_valid_region(win, valid_region);
-
- Status err = (window_changed)
- ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
- : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-CLPixelWiseDivisionKernel::CLPixelWiseDivisionKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-/**
- * Binds the tensors, selects and builds the division kernel and sets the execution window.
- *
- * The "pixelwise_div_int" kernel is used when no input is floating point and @p scale is
- * exactly 1/2^n with 0 <= n <= 15; otherwise "pixelwise_div_float" is used. The scale (n for
- * the int kernel, @p scale itself for the float kernel) is set once here as the argument that
- * follows the three 3D tensor arguments.
- *
- * @param[in]     input1          First operand.
- * @param[in]     input2          Second operand.
- * @param[in,out] output          Result tensor; may be auto-initialized.
- * @param[in]     scale           Scale factor; must not be negative.
- * @param[in]     overflow_policy Selects -DWRAP or -DSATURATE (always -DWRAP for float output).
- * @param[in]     rounding_policy Selects -DROUND=_rtz (TO_ZERO) or -DROUND=_rte (otherwise).
- */
-void CLPixelWiseDivisionKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
- ICLTensor *output, float scale,
- ConvertPolicy overflow_policy,
- RoundingPolicy rounding_policy)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(),
- scale, overflow_policy, rounding_policy));
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-
- _input1 = input1;
- _input2 = input2;
- _output = output;
-
- int scale_int = -1; // -1 means "use the float kernel"
- // Extract sign, exponent and mantissa
- int exponent = 0;
- float normalized_mantissa = std::frexp(scale, &exponent);
- // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
- // frexp returns 0.5 as mantissa for a power of two, so scale == 2^(exponent - 1) and the
- // exponent must be in the range -14 <= e <= 1
- // (e == 1 is scale 1, i.e. n == 0; e == -14 is scale 1/2^15, i.e. n == 15)
- if ((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1))
- {
- // Store the positive exponent. We know that we compute 1/2^n
- // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5
- scale_int = std::abs(exponent - 1);
- }
-
- std::string data_type;
- std::string compute_type;
- // Check if it has float inputs and output
- if (is_data_type_float(input1->info()->data_type()) ||
- is_data_type_float(input2->info()->data_type()))
- {
- scale_int = -1; // float inputs always take the float kernel, whatever the scale
- compute_type = (input1->info()->data_type() == DataType::F32 ||
- input2->info()->data_type() == DataType::F32)
- ? "float"
- : "half";
- data_type = "DATA_TYPE_FLOAT";
- }
- else
- {
- if (input1->info()->data_type() == DataType::S16 ||
- input2->info()->data_type() == DataType::S16)
- {
- compute_type = "int";
- }
- else
- {
- compute_type = "ushort";
- }
- data_type = "DATA_TYPE_INT";
- }
-
- // Construct kernel name
- std::string kernel_name = "pixelwise_div";
- kernel_name += (scale_int >= 0) ? "_int" : "_float";
-
- // Set kernel build options
- std::set<std::string> build_opts;
- build_opts.emplace(
- (overflow_policy == ConvertPolicy::WRAP || is_data_type_float(output->info()->data_type()))
- ? "-DWRAP"
- : "-DSATURATE");
- build_opts.emplace((rounding_policy == RoundingPolicy::TO_ZERO) ? "-DROUND=_rtz"
- : "-DROUND=_rte");
- build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
- build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
- build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
- build_opts.emplace("-DDATA_TYPE_RES=" + compute_type);
- build_opts.emplace("-D" + data_type);
-
- // Create kernel
- _kernel =
- static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
- // Set scale argument
- unsigned int idx = 3 * num_arguments_per_3D_tensor(); // Skip the inputs and output parameters
-
- if (scale_int >= 0)
- {
- _kernel.setArg(idx++, scale_int);
- }
- else
- {
- _kernel.setArg(idx++, scale);
- }
-
- ICLKernel::configure_internal(win_config.second);
-}
-
-/**
- * Static check of whether configure() would accept the given tensor infos and policies.
- *
- * @param[in] input1          Info of the first operand.
- * @param[in] input2          Info of the second operand.
- * @param[in] output          Info of the result tensor.
- * @param[in] scale           Scale factor.
- * @param[in] overflow_policy Overflow policy, forwarded to validate_arguments().
- * @param[in] rounding_policy Rounding policy, forwarded to validate_arguments().
- *
- * @return An error Status when an argument is null, when validate_arguments() rejects the
- *         arguments or when the window cannot be configured; an empty Status otherwise.
- */
-Status CLPixelWiseDivisionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2,
-                                           const ITensorInfo *output, float scale,
-                                           ConvertPolicy overflow_policy,
-                                           RoundingPolicy rounding_policy)
-{
-  // Report null arguments through the returned Status instead of an assert-style macro, so
-  // they are never dereferenced below.
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
-  ARM_COMPUTE_RETURN_ON_ERROR(
-      validate_arguments(input1, input2, output, scale, overflow_policy, rounding_policy));
-  // The window check runs on clones, so the caller's infos are left untouched.
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(),
-                                                            input2->clone().get(),
-                                                            output->clone().get())
-                                  .first);
-
-  return Status{};
-}
-/**
- * Enqueues the division kernel slice by slice, broadcasting the inputs where needed.
- *
- * @param[in]     window Execution window; must be a valid sub-window of the configured one.
- * @param[in,out] queue  Command queue the kernel is enqueued on.
- */
-void CLPixelWiseDivisionKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const TensorShape &in_shape1 = _input1->info()->tensor_shape();
- const TensorShape &in_shape2 = _input2->info()->tensor_shape();
- const TensorShape &out_shape = _output->info()->tensor_shape();
-
- /* Collapsing from DimZ upwards stays enabled when the smaller input has at most one
-  * element. Otherwise both inputs need more than Window::DimZ dimensions and equal sizes
-  * in every dimension from DimZ up to the output's rank. */
- bool can_collapse = true;
- if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
- {
- can_collapse =
- (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
- for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
- {
- can_collapse = (in_shape1[d] == in_shape2[d]);
- }
- }
-
- bool has_collapsed = false;
- Window collapsed =
- can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
- : window;
-
- const TensorShape &in_shape1_collapsed =
- has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
- const TensorShape &in_shape2_collapsed =
- has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
-
- Window slice = collapsed.first_slice_window_3D();
- Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
- Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
-
- do
- {
- unsigned int idx = 0; // argument index, advanced by each add_3D_tensor_argument call
- add_3D_tensor_argument(idx, _input1, slice_input1);
- add_3D_tensor_argument(idx, _input2, slice_input2);
- add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice);
-
- collapsed.slide_window_slice_3D(slice_input1); // return value ignored; the loop follows `slice`
- collapsed.slide_window_slice_3D(slice_input2);
- } while (collapsed.slide_window_slice_3D(slice));
-}
-
-/**
- * Border required by this kernel.
- *
- * Only the second BorderSize argument is non-zero. It is the amount by which the output's
- * dimension 0 exceeds the smaller of the two inputs' dimension 0, capped at one element less
- * than the number of elements processed per iteration.
- */
-BorderSize CLPixelWiseDivisionKernel::border_size() const
-{
-  // Smaller X extent of the two inputs.
-  const auto narrow_x = std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
-  const unsigned int replicate_size = _output->info()->dimension(0) - narrow_x;
-
-  // Upper limit for the border: one element less than a full iteration.
-  const unsigned int border_cap = num_elems_processed_per_iteration - 1U;
-
-  return BorderSize(0, std::min<unsigned int>(border_cap, replicate_size), 0, 0);
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
deleted file mode 100644
index f581780e1..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-namespace
-{
-// Returns a copy of @p input_shape whose dimension @p axis is set to 1.
-// NOTE Needed because the axis positions of the input and the output are not guaranteed to
-// be the same.
-const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis)
-{
-  TensorShape reduced_shape{input_shape};
-  reduced_shape.set(axis, 1);
-  return reduced_shape;
-}
-} // namespace
-
-namespace
-{
-/**
- * Checks the arguments of the reduce-operation kernel.
- *
- * @param[in] input  Input info. Single channel QASYMM8/F16/F32/S32; QASYMM8 is rejected for
- *                   MEAN and SUM.
- * @param[in] output Output info. Must be configured (non-zero total size), share the input's
- *                   data type and hold as many elements as the input shape with @p axis
- *                   reduced to 1.
- * @param[in] axis   Dimension to reduce; must be less than the input's number of dimensions.
- * @param[in] op     Reduce operation.
- *
- * @return An empty Status on success, otherwise the first error found.
- */
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis,
-                          ReduceOperation op)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-
-  if (output->total_size() != 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-  }
-
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
-                                                       DataType::F32, DataType::S32);
-  if (op == ReduceOperation::MEAN || op == ReduceOperation::SUM)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8,
-                                    "Not support QASYMM8, yet");
-  }
-  // NOTE(review): assert-style duplicate of the data type check above - confirm it is needed.
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
-                                  "Inputs are not broadcast compatible");
-
-  // ARM_COMPUTE_RETURN_ERROR_ON_MSG reports the error when its condition is TRUE, so the
-  // condition must describe an out-of-range axis. axis is unsigned, hence no lower bound test.
-  const auto num_dimensions = input->tensor_shape().num_dimensions();
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-      axis >= num_dimensions,
-      "axis must be greater than or equal to 0 and less than (input's rank).");
-
-  const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis);
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(),
-                                  "output shape's size does not match axis");
-
-  return Status{};
-}
-} // namespace
-
-// Creates an unconfigured kernel: null tensor pointers and a value-initialized axis.
-CLReduceOperationKernel::CLReduceOperationKernel()
-    : _input(nullptr), _output(nullptr), _axis()
-{
-}
-
-/**
- * Binds the tensors, selects and builds the reduction kernel and sets the execution window.
- *
- * @param[in]     input  Tensor to reduce.
- * @param[in,out] output Result tensor; its valid region is set to the reduced shape.
- * @param[in]     axis   Dimension to reduce.
- * @param[in]     op     MAX/MIN use "reduce_min_max" (OP_CODE 1/2), SUM/MEAN use
- *                       "reduce_sum_mean" (OP_CODE 3/4). Any other value throws
- *                       std::runtime_error.
- */
-void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *output,
-                                        const uint32_t axis, ReduceOperation op)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
-
-  _input = input;
-  _output = output;
-  _axis = axis;
-
-  // Work on a clone of the output info that carries the input shape with `axis` set to 1
-  auto reduced_info = output->info()->clone();
-  reduced_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis));
-
-  // Select the kernel and the operation code it is compiled with
-  std::string kernel_name;
-  int op_code = 0;
-  switch (op)
-  {
-    case ReduceOperation::MAX:
-      kernel_name = "reduce_min_max";
-      op_code = 1;
-      break;
-    case ReduceOperation::MIN:
-      kernel_name = "reduce_min_max";
-      op_code = 2;
-      break;
-    case ReduceOperation::SUM:
-      kernel_name = "reduce_sum_mean";
-      op_code = 3;
-      break;
-    case ReduceOperation::MEAN:
-      kernel_name = "reduce_sum_mean";
-      op_code = 4;
-      break;
-    default:
-      throw std::runtime_error("Operation not supported, yet");
-  }
-
-  // Compile-time definitions for the OpenCL program
-  std::set<std::string> options;
-  options.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(reduced_info->data_type()));
-  options.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(reduced_info->dimension(2)));
-  options.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code));
-
-  // Build the kernel
-  _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, options));
-
-  // The window spans the reduced shape
-  Window win = calculate_max_window(*reduced_info, Steps());
-
-  // Mark the complete reduced shape as the output's valid region, anchored at the origin
-  Coordinates origin;
-  origin.set_num_dimensions(reduced_info->num_dimensions());
-  output->info()->set_valid_region(ValidRegion(origin, reduced_info->tensor_shape()));
-
-  ICLKernel::configure_internal(win);
-}
-
-/**
- * Static check of whether configure() would accept the given arguments.
- *
- * @param[in] input  Info of the tensor to reduce.
- * @param[in] output Info of the result tensor.
- * @param[in] axis   Dimension to reduce.
- * @param[in] op     Reduce operation.
- *
- * @return An error Status when @p input or @p output is null or when validate_arguments()
- *         rejects the arguments; an empty Status otherwise.
- */
-Status CLReduceOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                         const uint32_t axis, ReduceOperation op)
-{
-  // Report null arguments through the returned Status instead of an assert-style macro.
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
-
-  return Status{};
-}
-/**
- * Enqueues the reduction kernel once over the collapsed window.
- *
- * The output tensor's shape is temporarily replaced by the input shape with _axis set to 1
- * while the arguments are set and the kernel is enqueued, and is restored before returning.
- *
- * @param[in]     window Execution window; must be a valid sub-window of the configured one.
- * @param[in,out] queue  Command queue the kernel is enqueued on.
- */
-void CLReduceOperationKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const TensorShape &shape_in = _input->info()->tensor_shape();
-
- unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
-
- _kernel.setArg<cl_int>(idx++, _axis); // axis being reduced
- _kernel.setArg<cl_int>(idx++, shape_in[_axis]); // input extent along that axis
-
- // Support dimensions up to 4
- Window slice_out = window.collapse(ICLKernel::window(), 2, 4);
-
- // Setup input slice: a copy of the output slice with dimensions 0..3 reset to (0, 0, 0)
- Window slice_in(slice_out);
- slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
- slice_in.set(3, Window::Dimension(0, 0, 0));
-
- // Copy output's shape in order to use for recovering at end of this method
- // TODO Remove changing and recovering output's shape if it is guaranteed that the axis positions
- // of input and output are the same
- const TensorShape shape_out = _output->info()->tensor_shape();
- _output->info()->set_tensor_shape(inferOutputShape(shape_in, _axis));
-
- idx = 0; // restart at the first kernel argument for the tensor parameters
- add_4D_tensor_argument(idx, _input, slice_in);
- add_4D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice_out);
-
- // Recover output's shape of output tensor
- _output->info()->set_tensor_shape(shape_out);
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp
deleted file mode 100644
index 6b0697e89..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp
+++ /dev/null
@@ -1,238 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_size,
- const ITensorInfo *padding_size, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
- DataType::S16, DataType::F16, DataType::S32,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_size, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(padding_size, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
- DataType::S16, DataType::F16, DataType::S32,
- DataType::F32);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != output->num_dimensions(),
- "The number of dimensions of input should be equal to output");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() != output->data_layout(),
- "The input and output layouts are different!");
-
- // TODO Support other cases
- if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NCHW)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != output->dimension(2),
- "Input Depth should be equal to Output Depth");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 ||
- padding_size->dimension(1) != 2,
- "Only 2-dimensional spatial block's size was wrong");
- }
- else if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NHWC)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(0) != output->dimension(0),
- "Input Depth should be equal to Output Depth");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 ||
- padding_size->dimension(1) != 2,
- "Only 2-dimensional spatial block's size was wrong");
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_MSG("CLSpaceToBatchNDKernel supports only 4-dimensional input");
- }
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() < 2 && input->num_dimensions() > 4,
- "CLSpaceToBatchNDKernel supports dimensions up to 4");
-
- if (input->data_type() == DataType::QASYMM8)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->quantization_info() != output->quantization_info(),
- "The input and output quantization info are different!");
- }
-
- return Status{};
-}
-
-} // namespace
-
-CLSpaceToBatchNDKernel::CLSpaceToBatchNDKernel() : _input(nullptr), _output(nullptr) {}
-
-void CLSpaceToBatchNDKernel::configure(const ICLTensor *input, const ICLTensor *block_size,
- const ICLTensor *padding_size, ICLTensor *output)
-{
-
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(
- validate_arguments(input->info(), block_size->info(), padding_size->info(), output->info()));
-
- _input = input;
- _block_size = block_size;
- _padding_size = padding_size;
- _output = output;
-
- // Set kernel build options
- // TODO Support other cases
- std::string kernel_name = "space_to_batch_4d";
- std::set<std::string> build_opts;
- Window win;
-
- if (input->info()->data_layout() == DataLayout::NCHW)
- {
- kernel_name += "_nchw";
- build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
- build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(1)));
- build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(0)));
-
- win = calculate_max_window(*output->info(), Steps());
-
- Coordinates coord;
- coord.set_num_dimensions(output->info()->num_dimensions());
- output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
- }
- else if (input->info()->data_layout() == DataLayout::NHWC)
- {
- kernel_name += "_nhwc";
- build_opts.emplace("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
- build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
- build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(1)));
- build_opts.emplace("-DVEC_SIZE=" +
- support::cpp11::to_string(num_elems_processed_per_iteration));
-
- win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- bool window_changed = update_window_and_padding(win, input_access, output_access);
- input_access.set_valid_region(win, output->info()->valid_region());
-
- if (window_changed)
- {
- ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!");
- }
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported layout");
- }
-
- build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.emplace("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(3)));
- if (input->info()->data_type() == DataType::QASYMM8)
- {
- build_opts.emplace("-DZERO_VALUE=" +
- support::cpp11::to_string(input->info()->quantization_info().offset));
- }
- else
- {
- build_opts.emplace("-DZERO_VALUE=" + support::cpp11::to_string(0));
- }
-
- // Create kernel
- _kernel =
- static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
- // Configure kernel window
- ICLKernel::configure_internal(win);
-}
-
-void CLSpaceToBatchNDKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
-#if defined(DEBUG)
- const_cast<ICLTensor *>(_block_size)->map(queue);
- const_cast<ICLTensor *>(_padding_size)->map(queue);
-
- const size_t num_dimensions = _input->info()->num_dimensions();
- const size_t num_spacial_dimensions = _block_size->info()->dimension(0);
- int32_t batch_size = _input->info()->dimension(num_dimensions - 1);
- for (size_t i = 0; i < num_spacial_dimensions; ++i)
- {
- const int32_t block_size = *reinterpret_cast<int32_t *>(_block_size->ptr_to_element({i}));
- const int32_t padding_size_pre =
- *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({0, i}));
- const int32_t padding_size_post =
- *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({1, i}));
-
- ARM_COMPUTE_ERROR_ON_MSG(block_size < 1, "Block size should be greater than or equal to 1");
- ARM_COMPUTE_ERROR_ON_MSG(padding_size_pre < 0 && padding_size_post < 0,
- "Padding size should be greater than or equal to 0");
-
- if (num_dimensions == 4 && _input->info()->data_layout() == DataLayout::NCHW)
- {
- ARM_COMPUTE_ERROR_ON_MSG(
- _output->info()->dimension(i) !=
- (_input->info()->dimension(i) + padding_size_pre + padding_size_post) / block_size,
- "Dimension value of spatial block does not match output's dimension value");
- }
- else
- {
- ARM_COMPUTE_ERROR_ON_MSG(
- _output->info()->dimension(num_dimensions - num_spacial_dimensions - 1 + i) !=
- (_input->info()->dimension(num_dimensions - num_spacial_dimensions - 1 + i) +
- padding_size_pre + padding_size_post) /
- block_size,
- "Dimension value of spatial block does not match output's dimension value");
- }
-
- batch_size *= block_size;
- }
- ARM_COMPUTE_ERROR_ON_MSG(
- _output->info()->dimension(num_dimensions - 1) != batch_size,
- "Output batch size should be equal to input batch size * (multiplication of all block size)");
-
- const_cast<ICLTensor *>(_block_size)->unmap(queue);
- const_cast<ICLTensor *>(_padding_size)->unmap(queue);
-#endif // defined(DEBUG)
-
- Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
- // Setup output slice
- Window slice_in(slice_out);
- slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
- slice_in.set(3, Window::Dimension(0, 0, 0));
-
- // Set block size window
- Window win_block = calculate_max_window(*_block_size->info(), Steps());
-
- // Set padding size window
- Window win_padding = calculate_max_window(*_padding_size->info(), Steps());
-
- do
- {
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, slice_in);
- add_4D_tensor_argument(idx, _output, slice_out);
- add_1D_tensor_argument(idx, _block_size, win_block);
- add_2D_tensor_argument(idx, _padding_size, win_padding);
- enqueue(queue, *this, slice_out);
- } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp
deleted file mode 100644
index 5d6329edc..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
- const int32_t block_size)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
- DataType::S16, DataType::S32, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
- DataType::S16, DataType::S32, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size >= 1,
- "Block size should be greater than or equal to 1.");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(3) == output->dimension(3),
- "Input batch should be equal to Output batch");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- input->dimension(2) * block_size * block_size == output->dimension(2),
- "Output depth should be equal to (input depth * block size *block size)");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(input->dimension(0) % block_size) &&
- !(input->dimension(1) % block_size),
- "Input height and width should be divisible by block size");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) == (input->dimension(0) / block_size)) &&
- (output->dimension(1) == (input->dimension(1) / block_size)),
- "Output height and width should be equal to "
- "input_height/blocksize and input_width/blocksize respectively");
-
- return Status{};
-}
-
-} // namespace
-
-CLSpaceToDepthKernel::CLSpaceToDepthKernel() : _input(nullptr), _output(nullptr) {}
-
-void CLSpaceToDepthKernel::configure(const ICLTensor *input, ICLTensor *output,
- const int32_t block_size)
-{
-
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size));
-
- _input = input;
- _output = output;
-
- // Set kernel build options
- std::set<std::string> build_opts;
- build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
- build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
-
- // Create kernel
- _kernel =
- static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("space_to_depth", build_opts));
-
- // Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps());
-
- Coordinates coord;
- coord.set_num_dimensions(output->info()->num_dimensions());
- output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-
- ICLKernel::configure_internal(win);
-}
-
-void CLSpaceToDepthKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
- Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
- // Setup output slice
- Window slice_out(slice_in);
- slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
- slice_out.set(3, Window::Dimension(0, 0, 0));
-
- do
- {
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, slice_in);
- add_4D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice_in);
- } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp
deleted file mode 100644
index 260bc39f1..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLSquaredDifferenceKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
-{
- const TensorShape &out_shape =
- TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
-
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::F16, DataType::F32);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
- "Inputs are not broadcast compatible");
- // Validate in case of configured output
- if (output->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
- "Wrong shape for output");
- }
- return Status{};
-}
-} // namespace
-
-CLSquaredDifferenceKernel::CLSquaredDifferenceKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void CLSquaredDifferenceKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
- ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate(input1->info(), input2->info(), output->info()));
-
- _input1 = input1;
- _input2 = input2;
- _output = output;
-
- // Create kernel
- std::set<std::string> build_opts;
- build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())));
- build_opts.emplace(
- ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
- _kernel = static_cast<cl::Kernel>(
- CLKernelLibraryEx::get().create_kernel("squared_difference", build_opts));
-
- const std::pair<TensorShape, ValidRegion> broadcast_pair =
- ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
-
- const TensorShape &out_shape = broadcast_pair.first;
- const ValidRegion &valid_region = broadcast_pair.second;
-
- // Auto initialize output if not initialized
- {
- set_shape_if_empty(*output->info(), out_shape);
-
- if (input1->info()->data_type() == DataType::F16 &&
- input2->info()->data_type() == DataType::F16)
- {
- set_format_if_unknown(*output->info(), Format::F16);
- }
- else if (input1->info()->data_type() == DataType::F32 ||
- input2->info()->data_type() == DataType::F32)
- {
- set_format_if_unknown(*output->info(), Format::F32);
- }
- }
-
- Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
- Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info());
- Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info());
-
- AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- bool window_changed = update_window_and_padding(win_input1, input1_access) ||
- update_window_and_padding(win_input2, input2_access) ||
- update_window_and_padding(win, output_access);
-
- output_access.set_valid_region(win, valid_region);
-
- ICLKernel::configure_internal(win);
-}
-
-void CLSquaredDifferenceKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const TensorShape &in_shape1 = _input1->info()->tensor_shape();
- const TensorShape &in_shape2 = _input2->info()->tensor_shape();
- const TensorShape &out_shape = _output->info()->tensor_shape();
-
- bool can_collapse = true;
- if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
- {
- can_collapse =
- (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
- for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
- {
- can_collapse = (in_shape1[d] == in_shape2[d]);
- }
- }
-
- bool has_collapsed = false;
- Window collapsed =
- can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
- : window;
-
- const TensorShape &in_shape1_collapsed =
- has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
- const TensorShape &in_shape2_collapsed =
- has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
-
- Window slice = collapsed.first_slice_window_3D();
- Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
- Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input1, slice_input1);
- add_3D_tensor_argument(idx, _input2, slice_input2);
- add_3D_tensor_argument(idx, _output, slice);
-
- enqueue(queue, *this, slice);
-
- collapsed.slide_window_slice_3D(slice_input1);
- collapsed.slide_window_slice_3D(slice_input2);
- } while (collapsed.slide_window_slice_3D(slice));
-}
-
-BorderSize CLSquaredDifferenceKernel::border_size() const
-{
- const unsigned int replicateSize =
- _output->info()->dimension(0) -
- std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
- const unsigned int border =
- std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
- return BorderSize(0, border, 0, 0);
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp
deleted file mode 100644
index 48146a43a..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp
+++ /dev/null
@@ -1,253 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLStridedSliceExKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/TensorInfo.h"
-
-using namespace arm_compute;
-
-CLStridedSliceExKernel::CLStridedSliceExKernel()
- : _input(nullptr), _output(nullptr), _beginData(nullptr), _endData(nullptr),
- _stridesData(nullptr), _beginMask(0), _endMask(0), _shrinkAxisMask(0)
-{
-}
-
-Status CLStridedSliceExKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *begin, const ITensorInfo *end,
- const ITensorInfo *strides, int32_t beginMask,
- int32_t endMask, int32_t shrinkAxisMask)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, begin, end, strides);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
- input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
- DataType::U32, DataType::S32, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(begin, 1, DataType::S32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(end, 1, DataType::S32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(strides, 1, DataType::S32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
- ARM_COMPUTE_ERROR_ON(begin->num_dimensions() != 1 || begin->dimension(0) > 4);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(begin->tensor_shape(), end->tensor_shape(),
- strides->tensor_shape());
-
- return Status{};
-}
-
-// Return the index for the first element along that axis. This index will be a
-// positive integer between [0, axisSize - 1] that can be used to index
-// directly into the data.
-inline int32_t StartForAxis(int32_t beginMask, int32_t begin, int32_t stride,
- const TensorShape &inputShape, int32_t axis)
-{
- // Begin with the specified index
- int32_t start = begin;
-
- // beginMask override
- if (beginMask & 1 << axis)
- {
- if (stride > 0)
- {
- // Forward iteration - use the first element. These values will get
- // clamped below (Note: We could have set them to 0 and axisSize-1, but
- // use lowest() and max() to maintain symmetry with StopForAxis())
- start = std::numeric_limits<int32_t>::lowest();
- }
- else
- {
- // Backward iteration - use the last element.
- start = std::numeric_limits<int32_t>::max();
- }
- }
-
- // Handle negative indices
- int32_t axisSize = inputShape[axis];
- if (start < 0)
- {
- start += axisSize;
- }
-
- // Clamping
- start = arm_compute::utility::clamp(start, 0, axisSize - 1);
-
- return start;
-}
-
-// Return the "real" index for the end of iteration along that axis. This is an
-// "end" in the traditional C sense, in that it points to one past the last
-// element. ie. So if you were iterating through all elements of a 1D array of
-// size 4, this function would return 4 as the stop, because it is one past the
-// "real" indices of 0, 1, 2 & 3.
-inline int32_t StopForAxis(int32_t endMask, int32_t end, int32_t stride,
- const TensorShape &inputShape, int32_t axis)
-{
- // Begin with the specified index
- int32_t stop = end;
-
- // endMask override
- if (endMask & (1 << axis))
- {
- if (stride > 0)
- {
- // Forward iteration - use the last element. These values will get
- // clamped below
- stop = std::numeric_limits<int32_t>::max();
- }
- else
- {
- // Backward iteration - use the first element.
- stop = std::numeric_limits<int32_t>::lowest();
- }
- }
-
- // Handle negative indices
- int32_t axisSize = inputShape[axis];
- if (stop < 0)
- {
- stop += axisSize;
- }
-
- // Clamping
- // Because the end index points one past the last element, we need slightly
- // different clamping ranges depending on the direction.
- if (stride > 0)
- {
- // Forward iteration
- stop = arm_compute::utility::clamp(stop, 0, axisSize);
- }
- else
- {
- // Backward iteration
- stop = arm_compute::utility::clamp(stop, -1, axisSize - 1);
- }
-
- return stop;
-}
-
-inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride)
-{
- int32_t ret = 0;
- if (stride > 0)
- {
- ret = ((stop - start - 1) / stride) + 1;
- }
- else
- {
- ret = ((stop - start + 1) / stride) + 1;
- }
- ARM_COMPUTE_ERROR_ON_MSG(ret < 0, "The dimension must be the natural number");
- return ret;
-}
-
-void CLStridedSliceExKernel::configure(const ICLTensor *input, ICLTensor *output,
- ICLTensor *beginData, ICLTensor *endData,
- ICLTensor *stridesData, int32_t beginMask, int32_t endMask,
- int32_t shrinkAxisMask)
-{
- ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), beginData->info(),
- endData->info(), stridesData->info(), beginMask, endMask,
- shrinkAxisMask));
-
- _input = input;
- _output = output;
- _beginData = beginData;
- _endData = endData;
- _stridesData = stridesData;
- _beginMask = beginMask;
- _endMask = endMask;
- _shrinkAxisMask = shrinkAxisMask;
-
- // Set kernel build options
- std::set<std::string> build_opts;
- build_opts.emplace("-DELEMENT_DATA_TYPE=" +
- get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
-
- // Create kernel
- _kernel = static_cast<cl::Kernel>(
- CLKernelLibraryEx::get().create_kernel("strided_slice_ex", build_opts));
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info(), Steps());
- ICLKernel::configure_internal(win);
-}
-
-void CLStridedSliceExKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- _beginData->map(queue);
- _endData->map(queue);
- _stridesData->map(queue);
-
- std::vector<int32_t> starts;
- std::vector<int32_t> strides;
-
- for (uint32_t n = 0; n < _beginData->info()->tensor_shape().total_size(); ++n)
- {
- const TensorShape shape = _input->info()->tensor_shape();
- starts.emplace_back(
- StartForAxis(_beginMask, reinterpret_cast<int32_t *>(_beginData->buffer())[n],
- reinterpret_cast<int32_t *>(_stridesData->buffer())[n], shape, n));
-
- strides.emplace_back(reinterpret_cast<int32_t *>(_stridesData->buffer())[n]);
- }
-
- for (uint32_t n = _beginData->info()->tensor_shape().total_size(); n < 4; n++)
- {
- starts.emplace_back(0);
- strides.emplace_back(1);
- }
- // TODO: Apply shrinkAxisMask
-
- _beginData->unmap(queue);
- _stridesData->unmap(queue);
- _endData->unmap(queue);
-
- unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
- const cl_int4 startsArg = {{
- static_cast<cl_int>(starts[0]), static_cast<cl_int>(starts[1]),
- static_cast<cl_int>(starts[2]), static_cast<cl_int>(starts[3]),
- }};
- _kernel.setArg<cl_int4>(idx++, startsArg);
-
- const cl_int4 stridesArg = {{
- static_cast<cl_int>(strides[0]), static_cast<cl_int>(strides[1]),
- static_cast<cl_int>(strides[2]), static_cast<cl_int>(strides[3]),
- }};
- _kernel.setArg<cl_int4>(idx++, stridesArg);
-
- Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
- // Setup output slice
- Window slice_in(slice_out);
- slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
- slice_in.set(3, Window::Dimension(0, 0, 0));
-
- do
- {
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, slice_in);
- add_4D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice_out);
- } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp
deleted file mode 100644
index 073c2f7bb..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp
+++ /dev/null
@@ -1,468 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-namespace arm_compute
-{
-////////////////////////////////////////////////////////////////////////////////
-CLTopKV2Single::CLTopKV2Single() : _input(nullptr), _topk_values(nullptr), _topk_indices(nullptr) {}
-
-void CLTopKV2Single::configure(ICLTensor *input, ICLTensor *topk_values, ICLTensor *topk_indices,
- cl::Buffer *indices, cl::Buffer *temp_stack, int k, int n)
-{
- ARM_COMPUTE_ERROR_ON(input == nullptr && indices == nullptr);
- ARM_COMPUTE_ERROR_ON(topk_values == nullptr && topk_indices == nullptr);
- ARM_COMPUTE_ERROR_ON(n == 0);
-
- _input = input;
- _topk_values = topk_values;
- _topk_indices = topk_indices;
-
- // Set kernel build options
- std::set<std::string> build_opts;
-
- // Create kernel
- _kernel = static_cast<cl::Kernel>(
- CLKernelLibraryEx::get().create_kernel("topkv2_quicksort", build_opts));
-
- unsigned int idx = 3 * num_arguments_per_1D_tensor();
- _kernel.setArg(idx++, *indices);
- _kernel.setArg(idx++, *temp_stack);
- _kernel.setArg<cl_int>(idx++, k);
- _kernel.setArg<cl_int>(idx++, n);
-
- // Configure kernel window
- Window win;
- win.set(0, Window::Dimension(0, 1, 1));
- ICLKernel::configure_internal(win);
-}
-
-void CLTopKV2Single::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- unsigned int idx = 0;
- add_1D_tensor_argument(idx, _input, window);
- add_1D_tensor_argument(idx, _topk_values, window);
- add_1D_tensor_argument(idx, _topk_indices, window);
-
- enqueue(queue, *this, window);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-CLTopKV2Init::CLTopKV2Init() : _input(nullptr) {}
-
-void CLTopKV2Init::configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffer *in_ind_buf,
- int n)
-{
- ARM_COMPUTE_ERROR_ON(input == nullptr && in_key_buf == nullptr);
- ARM_COMPUTE_ERROR_ON(in_ind_buf == nullptr);
- ARM_COMPUTE_ERROR_ON(n == 0);
-
- _input = input;
-
- // Set kernel build options
- std::set<std::string> build_opts;
-
- // Create kernel
- _kernel =
- static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("topkv2_init", build_opts));
-
- unsigned int idx = num_arguments_per_1D_tensor();
- _kernel.setArg(idx++, *in_key_buf);
- _kernel.setArg(idx++, *in_ind_buf);
- _kernel.setArg<cl_int>(idx++, n);
-
- // Configure kernel window
- Window win;
- win.set(0, Window::Dimension(0, n, 1));
- ICLKernel::configure_internal(win);
-}
-
-void CLTopKV2Init::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- unsigned int idx = 0;
- add_1D_tensor_argument(idx, _input, window);
-
- enqueue(queue, *this, window);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// This kernel makes a histogram of radix for each work item.
-CLRadixSortHistogram::CLRadixSortHistogram() : _pass(0), _in_key_buf(nullptr) {}
-
-void CLRadixSortHistogram::configure(cl::Buffer *hist_buf, int bits, int n)
-{
- ARM_COMPUTE_ERROR_ON(hist_buf == nullptr);
-
- unsigned int radix = 1 << bits;
- // Set kernel build options
- std::set<std::string> build_opts;
- build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
- build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
- build_opts.emplace("-DPERMUT=1");
-
- // Create kernel
- _kernel = static_cast<cl::Kernel>(
- CLKernelLibraryEx::get().create_kernel("radixsort_histogram", build_opts));
-
- int loc_histo_size = radix * _ITEMS * sizeof(cl_int);
-
- unsigned int idx = 1;
- _kernel.setArg(idx++, *hist_buf);
-
- idx = 3;
- _kernel.setArg(idx++, loc_histo_size, nullptr);
- _kernel.setArg<cl_int>(idx++, n);
-
- // Configure kernel window
- Window win;
- win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1));
- ICLKernel::configure_internal(win);
-}
-
-void CLRadixSortHistogram::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- _kernel.setArg(0, *_in_key_buf);
- _kernel.setArg<cl_int>(2, _pass);
-
- cl::NDRange lws = cl::NDRange(_ITEMS, 1);
-
- enqueue(queue, *this, window, lws);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-CLRadixSortScanHistogram::CLRadixSortScanHistogram() {}
-
-void CLRadixSortScanHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits)
-{
- ARM_COMPUTE_ERROR_ON(hist_buf == nullptr && glob_sum_buf == nullptr);
-
- unsigned int radix = 1 << bits;
- // Set kernel build options
- std::set<std::string> build_opts;
- build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
- build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
- build_opts.emplace("-DPERMUT=1");
-
- // Create kernel
- _kernel = static_cast<cl::Kernel>(
- CLKernelLibraryEx::get().create_kernel("radixsort_scanhistograms", build_opts));
-
- int temp_size =
- std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint);
-
- unsigned int idx = 0;
- _kernel.setArg(idx++, *hist_buf);
- _kernel.setArg(idx++, temp_size, nullptr);
- _kernel.setArg(idx++, *glob_sum_buf);
-
- // Configure kernel window
- Window win;
- win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1));
- ICLKernel::configure_internal(win);
-}
-
-void CLRadixSortScanHistogram::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
- cl::NDRange lws = cl::NDRange(gws_x / _HISTOSPLIT, 1);
-
- enqueue(queue, *this, window, lws);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-CLRadixSortGlobalScanHistogram::CLRadixSortGlobalScanHistogram() {}
-
-void CLRadixSortGlobalScanHistogram::configure(cl::Buffer *glob_sum_buf, cl::Buffer *temp_buf,
- int bits)
-{
- ARM_COMPUTE_ERROR_ON(glob_sum_buf == nullptr && temp_buf == nullptr);
-
- unsigned int radix = 1 << bits;
- // Set kernel build options
- std::set<std::string> build_opts;
- build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
- build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
- build_opts.emplace("-DPERMUT=1");
-
- // Create kernel
- _kernel = static_cast<cl::Kernel>(
- CLKernelLibraryEx::get().create_kernel("radixsort_scanhistograms", build_opts));
-
- int temp_size =
- std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint);
-
- unsigned int idx = 0;
- _kernel.setArg(idx++, *glob_sum_buf);
- _kernel.setArg(idx++, temp_size, nullptr);
- _kernel.setArg(idx++, *temp_buf);
-
- // Configure kernel window
- Window win;
- win.set(0, Window::Dimension(0, _HISTOSPLIT / 2, 1));
- ICLKernel::configure_internal(win);
-}
-
-void CLRadixSortGlobalScanHistogram::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
- cl::NDRange lws = cl::NDRange(gws_x, 1);
-
- enqueue(queue, *this, window, lws);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-CLRadixSortPasteHistogram::CLRadixSortPasteHistogram() {}
-
-void CLRadixSortPasteHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits)
-{
- ARM_COMPUTE_ERROR_ON(hist_buf == nullptr && glob_sum_buf == nullptr);
-
- unsigned int radix = 1 << bits;
- // Set kernel build options
- std::set<std::string> build_opts;
- build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
- build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
- build_opts.emplace("-DPERMUT=1");
-
- // Create kernel
- _kernel = static_cast<cl::Kernel>(
- CLKernelLibraryEx::get().create_kernel("radixsort_pastehistograms", build_opts));
-
- unsigned int idx = 0;
- _kernel.setArg(idx++, *hist_buf);
- _kernel.setArg(idx++, *glob_sum_buf);
-
- // Configure kernel window
- Window win;
- win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1));
- ICLKernel::configure_internal(win);
-}
-
-void CLRadixSortPasteHistogram::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
- cl::NDRange lws = cl::NDRange(gws_x / _HISTOSPLIT, 1);
-
- enqueue(queue, *this, window, lws);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-CLRadixSortReorder::CLRadixSortReorder()
- : _pass(0), _in_key_buf(nullptr), _out_key_buf(nullptr), _in_ind_buf(nullptr),
- _out_ind_buf(nullptr)
-{
-}
-
-void CLRadixSortReorder::configure(cl::Buffer *hist_buf, int bits, int n)
-{
- ARM_COMPUTE_ERROR_ON(hist_buf == nullptr);
- ARM_COMPUTE_ERROR_ON(n == 0);
-
- unsigned int radix = 1 << bits;
- // Set kernel build options
- std::set<std::string> build_opts;
- build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
- build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
- build_opts.emplace("-DPERMUT=1");
-
- // Create kernel
- _kernel = static_cast<cl::Kernel>(
- CLKernelLibraryEx::get().create_kernel("radixsort_reorder", build_opts));
-
- unsigned int idx = 2;
- _kernel.setArg(idx++, *hist_buf);
-
- idx = 6;
- _kernel.setArg(idx++, sizeof(uint) * radix * _ITEMS, nullptr);
- _kernel.setArg<cl_int>(idx++, n);
-
- // Configure kernel window
- Window win;
- win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1));
- ICLKernel::configure_internal(win);
-}
-
-void CLRadixSortReorder::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
- unsigned int lx = std::max(1U, (gws_x / _HISTOSPLIT));
- cl::NDRange lws = (lx < gws_x) ? cl::NDRange(lx, 1) : cl::NDRange(1, 1);
-
- _kernel.setArg(0, *_in_key_buf);
- _kernel.setArg(1, *_out_key_buf);
- _kernel.setArg<cl_int>(3, _pass);
- _kernel.setArg(4, *_in_ind_buf);
- _kernel.setArg(5, *_out_ind_buf);
-
- enqueue(queue, *this, window, lws);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-CLTopKV2FindFirstNegative::CLTopKV2FindFirstNegative() : _out_key_buf(nullptr) {}
-
-void CLTopKV2FindFirstNegative::configure(cl::Buffer *first_negative_idx_buf, int n)
-{
- ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr);
- ARM_COMPUTE_ERROR_ON(n == 0);
-
- // Set kernel build options
- std::set<std::string> build_opts;
-
- // Create kernel
- _kernel = static_cast<cl::Kernel>(
- CLKernelLibraryEx::get().create_kernel("topkv2_find_first_negative", build_opts));
-
- unsigned int idx = 1;
- _kernel.setArg(idx++, *first_negative_idx_buf);
- _kernel.setArg<cl_int>(idx++, n);
-
- // Configure kernel window
- Window win;
- win.set(0, Window::Dimension(0, n, 1));
- ICLKernel::configure_internal(win);
-}
-
-void CLTopKV2FindFirstNegative::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- unsigned int idx = 0;
- _kernel.setArg(idx++, *_out_key_buf);
-
- enqueue(queue, *this, window);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-CLTopKV2ReorderNegatives::CLTopKV2ReorderNegatives()
- : _in_key_buf(nullptr), _out_key_buf(nullptr), _in_ind_buf(nullptr), _out_ind_buf(nullptr)
-{
-}
-
-void CLTopKV2ReorderNegatives::configure(cl::Buffer *first_negative_idx_buf, int n)
-{
- ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr);
- ARM_COMPUTE_ERROR_ON(n == 0);
-
- // Set kernel build options
- std::set<std::string> build_opts;
-
- // Create kernel
- _kernel = static_cast<cl::Kernel>(
- CLKernelLibraryEx::get().create_kernel("topkv2_reorder_negatives", build_opts));
-
- unsigned int idx = 4;
- _kernel.setArg(idx++, *first_negative_idx_buf);
- _kernel.setArg<cl_int>(idx++, n);
-
- // Configure kernel window
- Window win;
- win.set(0, Window::Dimension(0, n, 1));
- ICLKernel::configure_internal(win);
-}
-
-void CLTopKV2ReorderNegatives::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- unsigned int idx = 0;
- _kernel.setArg(idx++, *_in_key_buf);
- _kernel.setArg(idx++, *_out_key_buf);
- _kernel.setArg(idx++, *_in_ind_buf);
- _kernel.setArg(idx++, *_out_ind_buf);
-
- enqueue(queue, *this, window);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-CLTopKV2Store::CLTopKV2Store()
- : _values(nullptr), _indices(nullptr), _out_key_buf(nullptr), _out_ind_buf(nullptr)
-{
-}
-
-void CLTopKV2Store::configure(ICLTensor *values, ICLTensor *indices, int k, int n)
-{
- ARM_COMPUTE_ERROR_ON(values == nullptr && indices == nullptr);
- ARM_COMPUTE_ERROR_ON(k == 0);
- ARM_COMPUTE_ERROR_ON(k > n);
-
- _values = values;
- _indices = indices;
-
- // Set kernel build options
- std::set<std::string> build_opts;
-
- // Create kernel
- _kernel =
- static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("topkv2_store", build_opts));
-
- unsigned int idx = 2 * num_arguments_per_1D_tensor() + 2;
- _kernel.setArg<cl_int>(idx++, n);
-
- // Configure kernel window
- Window win;
- win.set(0, Window::Dimension(0, k, 1));
- ICLKernel::configure_internal(win);
-}
-
-void CLTopKV2Store::setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf)
-{
- _out_key_buf = out_key_buf;
- _out_ind_buf = out_ind_buf;
-}
-
-void CLTopKV2Store::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- unsigned int idx = 0;
- add_1D_tensor_argument(idx, _values, window);
- add_1D_tensor_argument(idx, _indices, window);
- _kernel.setArg(idx++, *_out_key_buf);
- _kernel.setArg(idx++, *_out_ind_buf);
-
- enqueue(queue, *this, window);
-}
-
-} // namespace arm_compute