diff options
Diffstat (limited to 'libs/ARMComputeEx/src/core/CL/kernels')
24 files changed, 0 insertions, 4214 deletions
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp deleted file mode 100644 index 1fdd2f98f..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp +++ /dev/null @@ -1,211 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLActivationLayerExKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/UtilsEx.h" - -using namespace arm_compute; - -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - const ActivationLayerInfoEx &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, - DataType::F16, DataType::F32); - - // Checks performed when output is configured - if ((output != nullptr) && (output->total_size() != 0)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - } - - return Status{}; -} - -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) -{ - if (output != nullptr) - { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output, *input); - } - - const unsigned int num_elems_processed_per_iteration = 16 / input->element_size(); - - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - bool window_changed = false; - - if (output != nullptr) - { - AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - window_changed = update_window_and_padding(win, input_access, output_access); - output_access.set_valid_region(win, input->valid_region()); - } - else - { - window_changed = update_window_and_padding( - win, AccessWindowHorizontal(input, 0, num_elems_processed_per_iteration)); - } - - Status err = (window_changed) - ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; - return std::make_pair(err, win); -} -} // namespace - -CLActivationLayerExKernel::CLActivationLayerExKernel() - : _input(nullptr), _output(nullptr), _run_in_place(false) -{ -} - -void CLActivationLayerExKernel::configure(ICLTensor *input, ICLTensor *output, - ActivationLayerInfoEx act_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input); - - _run_in_place = (output == nullptr) || (output == input); - - if (output != nullptr) - { - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), *input->info()->clone()); - } - - ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, act_info)); - - const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size(); - const DataType dt = input->info()->data_type(); - float a_const = act_info.a(); - float b_const = act_info.b(); - int a_const_int = 0; - int b_const_int = 0; - - // Create quantized version of constants a, b if needed - if (is_data_type_quantized(dt)) - { - a_const_int = - input->info()->quantization_info().quantize(a_const, RoundingPolicy::TO_NEAREST_UP); - b_const_int = - input->info()->quantization_info().quantize(b_const, RoundingPolicy::TO_NEAREST_UP); - } - - // Set build options - std::set<std::string> build_opts; - build_opts.emplace( - ("-DACT=" + lower_string(string_from_activation_func_ex(act_info.activation())))); - build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt))); - build_opts.emplace( - ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); - - if (is_data_type_quantized(dt)) - { - build_opts.emplace(("-DA_VAL=" + support::cpp11::to_string(a_const_int))); - build_opts.emplace(("-DB_VAL=" + support::cpp11::to_string(b_const_int))); - - const int o1 = input->info()->quantization_info().offset; - // Quantized value of 0 corresponds to the offset o1 - build_opts.emplace(("-DCONST_0=" + support::cpp11::to_string(o1))); - - // Set scale and offset of the input and output if they have different quantization info - if (is_data_type_quantized_asymmetric(dt) && output != nullptr) - { - const float s1 = input->info()->quantization_info().scale; - const float s2 = output->info()->quantization_info().scale; - const int o2 = output->info()->quantization_info().offset; - - if (o1 != o2 || s1 != s2) - { - build_opts.emplace(("-DS1_VAL=" + float_to_string_with_full_precision(s1))); - build_opts.emplace(("-DS2_VAL=" + float_to_string_with_full_precision(s2))); - build_opts.emplace(("-DO1_VAL=" + support::cpp11::to_string(o1))); - build_opts.emplace(("-DO2_VAL=" + support::cpp11::to_string(o2))); - } - } - } - else - { - build_opts.emplace(("-DA_VAL=" + float_to_string_with_full_precision(a_const))); - build_opts.emplace(("-DB_VAL=" + float_to_string_with_full_precision(b_const))); - } - - build_opts.emplace((_run_in_place) ? "-DIN_PLACE" : ""); - - // Create kernel - std::string kernel_name = std::string("activation_layer_ex"); - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); - - // Make sure _kernel is initialized before calling the parent's configure - _input = input; - _output = output; - - // Configure kernel window - auto win_config = - validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); - - // Set config_id for enabling LWS tuning - _config_id = "activation_layer_ex_"; - _config_id += lower_string(string_from_data_type(dt)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(1)); -} - -Status CLActivationLayerExKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const ActivationLayerInfoEx &act_info) -{ - const bool run_in_place = (output == nullptr) || (output == input); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, act_info)); - ARM_COMPUTE_RETURN_ON_ERROR( - validate_and_configure_window(input->clone().get(), - (run_in_place) ? nullptr : output->clone().get()) - .first); - - return Status{}; -} - -void CLActivationLayerExKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice); - if (!_run_in_place) - { - add_3D_tensor_argument(idx, _output, slice); - } - enqueue(queue, *this, slice, lws_hint()); - } while (collapsed.slide_window_slice_3D(slice)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp deleted file mode 100644 index c1a2ad0be..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLArgMinMaxKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t argminmax_axis) -{ - TensorShape out_shape{input_shape}; - - out_shape.set(argminmax_axis, 1); - - return out_shape; -} -} // namespace - -namespace -{ -constexpr unsigned int num_elems_processed_per_iteration = 16; - -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - const uint32_t argminmax_axis, ArgOperation op) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32, DataType::F32, - DataType::U8); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - - ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(input, output); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0, - "Inputs are not broadcast compatible"); - - const TensorShape output_shape = inferOutputShape(input->tensor_shape(), argminmax_axis); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(), - "output shape's size does not match argminmax_axis"); - - const auto num_dimensions = input->tensor_shape().num_dimensions(); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - argminmax_axis >= 0 && argminmax_axis < num_dimensions, - "argminmax_axis must be greater than or equal to 0 and less than (input's rank)."); - return Status{}; -} - -} // namespace - -CLArgMinMaxKernel::CLArgMinMaxKernel() : _input(nullptr), _output(nullptr), _argminmax_axis() {} - -void CLArgMinMaxKernel::configure(const ICLTensor *input, ICLTensor *output, - const uint32_t argminmax_axis, ArgOperation op) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), argminmax_axis)); - - _input = input; - _output = output; - _argminmax_axis = argminmax_axis; - - std::unique_ptr<ITensorInfo> output_info = output->info()->clone(); - output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), argminmax_axis)); - - // Construct kernel name for argmax and argmin based on axis - std::string kernel_name = "arg_op"; - int op_code = 0; - if (op == ArgOperation::MAX) - { - op_code = 1; - } - else if (op == ArgOperation::MIN) - { - op_code = 2; - } - else - throw std::runtime_error("Operation not supported, yet"); - - // Set kernel build options - std::set<std::string> build_opts; - build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(output_info->data_type())); - build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2))); - build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code)); - - // Create kernel - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); - - // Configure kernel window - Window win = calculate_max_window(*output_info, Steps()); - - Coordinates coord; - coord.set_num_dimensions(output_info->num_dimensions()); - output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape())); - - ICLKernel::configure_internal(win); -} - -Status CLArgMinMaxKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const uint32_t argminmax_axis, ArgOperation op) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, argminmax_axis, op)); - - return Status{}; -} - -void CLArgMinMaxKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const TensorShape &shape_in = _input->info()->tensor_shape(); - - unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters - - _kernel.setArg<cl_int>(idx++, _argminmax_axis); - _kernel.setArg<cl_int>(idx++, shape_in[_argminmax_axis]); - - Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); - - // Setup input slice - Window slice_in(slice_out); - slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - slice_in.set(3, Window::Dimension(0, 0, 0)); - - // Copy output's shape in order to use for recovering at end of this method - const TensorShape shape_out = _output->info()->tensor_shape(); - _output->info()->set_tensor_shape(inferOutputShape(shape_in, _argminmax_axis)); - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, _input, slice_in); - add_4D_tensor_argument(idx, _output, slice_out); - enqueue(queue, *this, slice_out); - } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); - - // Recover output's shape of output tensor - _output->info()->set_tensor_shape(shape_out); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp deleted file mode 100644 index 1c505b4d5..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp +++ /dev/null @@ -1,216 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionExKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -constexpr unsigned int num_elems_processed_per_iteration = 16; - -Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *output, ConvertPolicy policy) -{ - ARM_COMPUTE_UNUSED(policy); - - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, - DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, - DataType::F16, DataType::F32); - - const TensorShape &out_shape = - TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, - "Inputs are not broadcast compatible"); - - // Validate in case of configured output - if (output->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, - DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - output->data_type() == DataType::U8 && - (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8), - "Output can only be U8 if both inputs are U8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), - "Wrong shape for output"); - } - - return Status{}; -} - -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, - ITensorInfo *output) -{ - const std::pair<TensorShape, ValidRegion> broadcast_pair = - ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2); - const TensorShape &out_shape = broadcast_pair.first; - const ValidRegion &valid_region = broadcast_pair.second; - - // Auto initialize output if not initialized - { - set_shape_if_empty(*output, out_shape); - - if (input1->data_type() == DataType::S16 || input2->data_type() == DataType::S16) - { - set_format_if_unknown(*output, Format::S16); - } - else if (input1->data_type() == DataType::F16 && input2->data_type() == DataType::F16) - { - set_format_if_unknown(*output, Format::F16); - } - else if (input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32) - { - set_format_if_unknown(*output, Format::F32); - } - } - - Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); - Window win_input1 = win.broadcast_if_dimension_le_one(*input1); - Window win_input2 = win.broadcast_if_dimension_le_one(*input2); - - AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win_input1, input1_access) || - update_window_and_padding(win_input2, input2_access) || - update_window_and_padding(win, output_access); - - output_access.set_valid_region(win, valid_region); - - Status err = (window_changed) - ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; - return std::make_pair(err, win); -} -} // namespace - -CLArithmeticSubtractionExKernel::CLArithmeticSubtractionExKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr) -{ -} - -void CLArithmeticSubtractionExKernel::configure(const ICLTensor *input1, const ICLTensor *input2, - ICLTensor *output, ConvertPolicy policy) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); - ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(input1->info(), input2->info(), output->info(), policy)); - - // Configure kernel window - auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - - _input1 = input1; - _input2 = input2; - _output = output; - - const bool has_float_out = is_data_type_float(output->info()->data_type()); - - // Set kernel build options - std::set<std::string> build_opts; - build_opts.emplace((policy == ConvertPolicy::WRAP || has_float_out) ? "-DWRAP" : "-DSATURATE"); - build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type())); - build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type())); - build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); - - // Create kernel - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("arithmetic_sub_ex", build_opts)); - - ICLKernel::configure_internal(win_config.second); -} - -Status CLArithmeticSubtractionExKernel::validate(const ITensorInfo *input1, - const ITensorInfo *input2, - const ITensorInfo *output, ConvertPolicy policy) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, policy)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), - input2->clone().get(), - output->clone().get()) - .first); - - return Status{}; -} - -void CLArithmeticSubtractionExKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const TensorShape &in_shape1 = _input1->info()->tensor_shape(); - const TensorShape &in_shape2 = _input2->info()->tensor_shape(); - const TensorShape &out_shape = _output->info()->tensor_shape(); - - bool can_collapse = true; - if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) - { - can_collapse = - (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); - for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) - { - can_collapse = (in_shape1[d] == in_shape2[d]); - } - } - - bool has_collapsed = false; - Window collapsed = - can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) - : window; - - const TensorShape &in_shape1_collapsed = - has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; - const TensorShape &in_shape2_collapsed = - has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; - - Window slice = collapsed.first_slice_window_3D(); - Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); - Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); - - do - { - unsigned int idx = 0; - - add_3D_tensor_argument(idx, _input1, slice_input1); - add_3D_tensor_argument(idx, _input2, slice_input2); - add_3D_tensor_argument(idx, _output, slice); - - enqueue(queue, *this, slice); - - collapsed.slide_window_slice_3D(slice_input1); - collapsed.slide_window_slice_3D(slice_input2); - } while (collapsed.slide_window_slice_3D(slice)); -} - -BorderSize CLArithmeticSubtractionExKernel::border_size() const -{ - const unsigned int replicateSize = - _output->info()->dimension(0) - - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); - const unsigned int border = - std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); - return BorderSize(0, border, 0, 0); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp deleted file mode 100644 index b0016d23c..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - const int32_t *block_size) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size[0] >= 1 && block_size[1] >= 1, - "Block size should be greater than or equal to 1."); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) == output->dimension(2), - "Input Depth should be equal to Output Depth"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - output->dimension(3) * block_size[0] * block_size[1] == input->dimension(3), - "Input batch should be equal to (output batch * block size[0] *block size[1])"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(output->dimension(0) % block_size[1]) && - !(output->dimension(1) % block_size[0]), - "Output height and width should be divisible by block size[0] " - "and block_size[1] respectively"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) == input->dimension(0) * block_size[1]) && - (output->dimension(1) == input->dimension(1) * block_size[0]), - "Output height and width should be equal to " - "input_height*blocksize[0] and input_width*blocksize[1] " - "respectively"); - - return Status{}; -} - -} // namespace - -CLBatchToSpaceNDKernel::CLBatchToSpaceNDKernel() : _input(nullptr), _output(nullptr) {} - -void CLBatchToSpaceNDKernel::configure(const ICLTensor *input, ICLTensor *output, - const int32_t *block_size) -{ - - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size)); - - _input = input; - _output = output; - - // Set kernel build options - std::set<std::string> build_opts; - build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.emplace("-DBLOCK_SIZE0=" + support::cpp11::to_string(block_size[0])); - build_opts.emplace("-DBLOCK_SIZE1=" + support::cpp11::to_string(block_size[1])); - build_opts.emplace("-DBATCH_OUT=" + support::cpp11::to_string(output->info()->dimension(3))); - build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); - - // Create kernel - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("batch_to_space_nd", build_opts)); - - // Configure kernel window - Window win = calculate_max_window(*output->info(), Steps()); - - Coordinates coord; - coord.set_num_dimensions(output->info()->num_dimensions()); - output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); - - ICLKernel::configure_internal(win); -} - -void CLBatchToSpaceNDKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); - - Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); - - // Setup output slice - Window slice_out(slice_in); - slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); - slice_out.set(3, Window::Dimension(0, 0, 0)); - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, _input, slice_out); - add_4D_tensor_argument(idx, _output, slice_in); - enqueue(queue, *this, slice_in); - } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp deleted file mode 100644 index 3d2f2c702..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp +++ /dev/null @@ -1,173 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -constexpr unsigned int num_elems_processed_per_iteration = 16; - -Status validate_parameters(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *output) -{ - const TensorShape &out_shape = - TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); - - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, - "Inputs are not broadcast compatible"); - // Validate in case of configured output - if (output->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, - DataType::QASYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), - "Wrong shape for output"); - } - return Status{}; -} -} // namespace - -CLBinaryLogicalOpKernel::CLBinaryLogicalOpKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr) -{ -} - -void CLBinaryLogicalOpKernel::configure(const ICLTensor *input1, const ICLTensor *input2, - ICLTensor *output, BinaryLogicalOperation op) -{ - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_parameters(input1->info(), input2->info(), output->info())); - - _input1 = input1; - _input2 = input2; - _output = output; - - // Create kernel - std::string kernel_name = "binary_logical_op"; - std::set<std::string> build_opts; - build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type()))); - - int op_code = 0; - switch (op) - { - case BinaryLogicalOperation::AND: - op_code = 1; - break; - case BinaryLogicalOperation::OR: - op_code = 2; - break; - default: - throw std::runtime_error("Operation not supported, yet"); - } - - build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code))); - build_opts.emplace( - ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); - - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); - - const std::pair<TensorShape, ValidRegion> broadcast_pair = - ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info()); - - const TensorShape &out_shape = broadcast_pair.first; - const ValidRegion &valid_region = broadcast_pair.second; - - Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); - Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info()); - Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info()); - - AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win_input1, input1_access) || - update_window_and_padding(win_input2, input2_access) || - update_window_and_padding(win, output_access); - - output_access.set_valid_region(win, valid_region); - - ICLKernel::configure_internal(win); -} - -void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const TensorShape &in_shape1 = _input1->info()->tensor_shape(); - const TensorShape &in_shape2 = _input2->info()->tensor_shape(); - const TensorShape &out_shape = _output->info()->tensor_shape(); - - bool can_collapse = true; - if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) - { - can_collapse = - (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); - for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) - { - can_collapse = (in_shape1[d] == in_shape2[d]); - } - } - - bool has_collapsed = false; - Window collapsed = - can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) - : window; - - const TensorShape &in_shape1_collapsed = - has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; - const TensorShape &in_shape2_collapsed = - has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; - - Window slice = collapsed.first_slice_window_3D(); - Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); - Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input1, slice_input1); - add_3D_tensor_argument(idx, _input2, slice_input2); - add_3D_tensor_argument(idx, _output, slice); - - enqueue(queue, *this, slice); - - collapsed.slide_window_slice_3D(slice_input1); - collapsed.slide_window_slice_3D(slice_input2); - } while (collapsed.slide_window_slice_3D(slice)); -} - -BorderSize CLBinaryLogicalOpKernel::border_size() const -{ - const unsigned int replicateSize = - _output->info()->dimension(0) - - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); - const unsigned int border = - std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); - return BorderSize(0, border, 0, 0); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp deleted file mode 100644 index bf7ebae3f..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLCastKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -CLCastKernel::CLCastKernel() : _input(nullptr), _output(nullptr) {} - -void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); - - _input = input; - _output = output; - - constexpr unsigned int num_elems_processed_per_iteration = 16; - - // Set kernel build options - std::set<std::string> build_opts; - build_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); - build_opts.emplace( - ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); - - // Create kernel - if (is_data_type_quantized_asymmetric(input->info()->data_type())) - { - const float scale_in = input->info()->quantization_info().scale; - const int offset_in = input->info()->quantization_info().offset; - build_opts.emplace("-DSCALE=" + float_to_string_with_full_precision(scale_in)); - build_opts.emplace("-DOFFSET=" + support::cpp11::to_string(offset_in)); - - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("cast_qasymm_in", build_opts)); - } - else if (is_data_type_quantized_asymmetric(output->info()->data_type())) - { - const float scale_in = output->info()->quantization_info().scale; - const int offset_in = output->info()->quantization_info().offset; - build_opts.emplace("-DSCALE=" + float_to_string_with_full_precision(scale_in)); - build_opts.emplace("-DOFFSET=" + support::cpp11::to_string(offset_in)); - - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("cast_qasymm_out", build_opts)); - } - else - { - _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("cast", build_opts)); - } - - // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - update_window_and_padding(win, input_access, output_access); - output_access.set_valid_region(win, input->info()->valid_region()); - - ICLKernel::configure_internal(win); -} - -void CLCastKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice); - add_3D_tensor_argument(idx, _output, slice); - enqueue(queue, *this, slice); - } while (collapsed.slide_window_slice_3D(slice)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp deleted file mode 100644 index 5af5b16ea..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp +++ /dev/null @@ -1,212 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLComparisonOpKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -constexpr unsigned int num_elems_processed_per_iteration = 16; - -Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *output) -{ - const TensorShape &out_shape = - TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); - - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::U16, - DataType::S16, DataType::F16, DataType::S32, - DataType::F32, DataType::QASYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::U16, - DataType::S16, DataType::F16, DataType::S32, - DataType::F32, DataType::QASYMM8); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, - "Inputs are not broadcast compatible"); - // Validate in case of configured output - if (output->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), - "Wrong shape for output"); - } - return Status{}; -} -} // namespace - -CLComparisonOpKernel::CLComparisonOpKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr) -{ -} - -void CLComparisonOpKernel::configure(const ICLTensor *input1, const ICLTensor *input2, - ICLTensor *output, const ComparisonOperation &op) -{ - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info())); - - _input1 = input1; - _input2 = input2; - _output = output; - - // Create kernel - std::string kernel_name = "comparison_op"; - int op_code = 0; - - switch (op) - { - case ComparisonOperation::EQUAL: - op_code = 1; - break; - case ComparisonOperation::NOT_EQUAL: - op_code = 2; - break; - default: - throw std::runtime_error(" Operation not supported, yet"); - } - - std::set<std::string> build_opts; - build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code))); - build_opts.emplace(("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input1->info()->data_type()))); - build_opts.emplace( - ("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()))); - build_opts.emplace( - ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); - - if (is_data_type_quantized_asymmetric(input1->info()->data_type()) && - ((input1->info()->quantization_info().offset != input2->info()->quantization_info().offset) || - (input1->info()->quantization_info().scale != input2->info()->quantization_info().scale))) - { - build_opts.emplace("-DOFFSET_IN1=" + - support::cpp11::to_string(input1->info()->quantization_info().offset)); - build_opts.emplace("-DOFFSET_IN2=" + - support::cpp11::to_string(input2->info()->quantization_info().offset)); - build_opts.emplace("-DSCALE_IN1=" + - support::cpp11::to_string(input1->info()->quantization_info().scale)); - build_opts.emplace("-DSCALE_IN2=" + - support::cpp11::to_string(input2->info()->quantization_info().scale)); - kernel_name += "_qasymm8"; - } - - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); - - const std::pair<TensorShape, ValidRegion> broadcast_pair = - ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info()); - - const TensorShape &out_shape = broadcast_pair.first; - const ValidRegion &valid_region = broadcast_pair.second; - - // Auto initialize output if not initialized - { - set_shape_if_empty(*output->info(), out_shape); - - if (input1->info()->data_type() == DataType::S16 || - input2->info()->data_type() == DataType::S16) - { - set_format_if_unknown(*output->info(), Format::S16); - } - else if (input1->info()->data_type() == DataType::F16 && - input2->info()->data_type() == DataType::F16) - { - set_format_if_unknown(*output->info(), Format::F16); - } - else if (input1->info()->data_type() == DataType::F32 || - input2->info()->data_type() == DataType::F32) - { - set_format_if_unknown(*output->info(), Format::F32); - } - } - - Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); - Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info()); - Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info()); - - AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win_input1, input1_access) || - update_window_and_padding(win_input2, input2_access) || - update_window_and_padding(win, output_access); - - output_access.set_valid_region(win, valid_region); - - ICLKernel::configure_internal(win); -} - -void CLComparisonOpKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const TensorShape &in_shape1 = _input1->info()->tensor_shape(); - const TensorShape &in_shape2 = _input2->info()->tensor_shape(); - const TensorShape &out_shape = _output->info()->tensor_shape(); - - bool can_collapse = true; - if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) - { - can_collapse = - (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); - for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) - { - can_collapse = (in_shape1[d] == in_shape2[d]); - } - } - - bool has_collapsed = false; - Window collapsed = - can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) - : window; - - const TensorShape &in_shape1_collapsed = - has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; - const TensorShape &in_shape2_collapsed = - has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; - - Window slice = collapsed.first_slice_window_3D(); - Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); - Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input1, slice_input1); - add_3D_tensor_argument(idx, _input2, slice_input2); - add_3D_tensor_argument(idx, _output, slice); - - enqueue(queue, *this, slice); - - collapsed.slide_window_slice_3D(slice_input1); - collapsed.slide_window_slice_3D(slice_input2); - } while (collapsed.slide_window_slice_3D(slice)); -} - -BorderSize CLComparisonOpKernel::border_size() const -{ - const unsigned int replicateSize = - _output->info()->dimension(0) - - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); - const unsigned int border = - std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); - return BorderSize(0, border, 0, 0); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp deleted file mode 100644 index c386e3312..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - const int32_t block_size) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size >= 1, - "Block size should be greater than or equal to 1."); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) == input->dimension(0) * block_size, - "Output width should be equal to (Input width * block size)"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(1) == input->dimension(1) * block_size, - "Output height should be equal to (Input height * block size)"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) % (block_size * block_size) == 0, - "Input depth should be divisible by (block size * block size)"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - output->dimension(2) == input->dimension(2) / (block_size * block_size), - "Output depth should be equal to (Input depth / (block size * block size))"); - - return Status{}; -} -} // namespace - -CLDepthToSpaceKernel::CLDepthToSpaceKernel() : _input(nullptr), _output(nullptr) -{ - // DO NOTHING -} - -void CLDepthToSpaceKernel::configure(const ICLTensor *input, ICLTensor *output, - const int32_t block_size) -{ - - _input = input; - _output = output; - - // Set kernel build options - std::set<std::string> build_opts; - build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size)); - build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); - - // Create kernel - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("depth_to_space", build_opts)); - - // Configure kernel window - Window win = calculate_max_window(*output->info(), Steps()); - - Coordinates coord; - coord.set_num_dimensions(output->info()->num_dimensions()); - output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); - - ICLKernel::configure_internal(win); -} - -void CLDepthToSpaceKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); - - Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); - - // Setup input slice - Window slice_in(slice_out); - slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - slice_in.set(3, Window::Dimension(0, 0, 0)); - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, _input, slice_in); - add_4D_tensor_argument(idx, _output, slice_out); - enqueue(queue, *this, slice_out); - } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp deleted file mode 100644 index 0862b78bf..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -constexpr unsigned int num_elems_processed_per_iteration = 16; - -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) -{ - Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win, input_access, output_access); - input_access.set_valid_region(win, output->valid_region()); - - Status err = (window_changed) - ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; - return std::make_pair(err, win); -} -} // namespace - -CLEmbeddingLookupKernel::CLEmbeddingLookupKernel() - : _input(nullptr), _output(nullptr), _lookups(nullptr) -{ -} - -Status CLEmbeddingLookupKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *lookups) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - - ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4); - ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1); - - return Status{}; -} - -void CLEmbeddingLookupKernel::configure(const ICLTensor *input, ICLTensor *output, - const ICLTensor *lookups) -{ - ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info())); - - _input = input; - _output = output; - _lookups = lookups; - - // Set kernel build options - std::stringstream kernel_name; - std::set<std::string> build_opts; - kernel_name << "embedding_lookup"; - - build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); - build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions())); - - // Create kernel - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts)); - - // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), output->info()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); -} - -void CLEmbeddingLookupKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); - - Window win_lookup; - win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0)); - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, _input, slice_in); - add_4D_tensor_argument(idx, _output, slice_in); - add_1D_tensor_argument(idx, _lookups, win_lookup); - - enqueue(queue, *this, slice_in); - } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_1D(win_lookup)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp deleted file mode 100644 index b1ee21bdc..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLExpKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -CLExpKernel::CLExpKernel() : _input(nullptr), _output(nullptr) {} - -void CLExpKernel::configure(const ICLTensor *input, ICLTensor *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - // Auto initialize output - auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), - input->info()->quantization_info()); - - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - - _input = input; - _output = output; - - constexpr unsigned int num_elems_processed_per_iteration = 4; - - // Create kernel - std::set<std::string> build_opts; - build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); - build_opts.emplace( - ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("exp_layer", build_opts)); - - // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - update_window_and_padding(win, input_access, output_access); - output_access.set_valid_region(win, input->info()->valid_region()); - - ICLKernel::configure_internal(win); -} - -void CLExpKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice); - add_3D_tensor_argument(idx, _output, slice); - enqueue(queue, *this, slice); - } while (collapsed.slide_window_slice_3D(slice)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp deleted file mode 100644 index ae2801e2b..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLGatherKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -constexpr unsigned int num_elems_processed_per_iteration = 1; - -Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S32, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S32, - DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); - - return Status{}; -} - -} // namespace - -CLGatherKernel::CLGatherKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr) {} - -void CLGatherKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info())); - - _input1 = input1; - _input2 = input2; - _output = output; - - // Construct kernel name - std::string kernel_name = "gather"; - if (input1->info()->num_dimensions() == 1) - { - kernel_name = "gather_1d"; - } - else if (input1->info()->num_dimensions() == 2) - { - if (_output->info()->num_dimensions() == 1) - { - kernel_name = "gather_1d_out"; - } - } - - // Set kernel build options - std::set<std::string> build_opts; - build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type())); - build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type())); - build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); - - // Create kernel - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); - - // Configure kernel window - Window win = calculate_max_window(*input2->info(), Steps(num_elems_processed_per_iteration)); - output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); - - ICLKernel::configure_internal(win); -} - -Status CLGatherKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output)); - - return Status{}; -} - -void CLGatherKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - if (_input1->info()->num_dimensions() == 1) - { - Window slice = window.first_slice_window_1D(); - - unsigned int idx = 0; - add_1D_tensor_argument(idx, _input1, slice); - add_1D_tensor_argument(idx, _input2, slice); - add_1D_tensor_argument(idx, _output, slice); - enqueue(queue, *this, slice); - } - else if (_input1->info()->num_dimensions() == 2) - { - Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY); - Window slice = window.collapse_if_possible(ICLKernel::window(), Window::DimX); - - // Set inputs - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input1, window_collapsed); - add_1D_tensor_argument(idx, _input2, slice); - if (_output->info()->num_dimensions() == 1) - { - add_1D_tensor_argument(idx, _output, slice); - } - else - { - add_2D_tensor_argument(idx, _output, window_collapsed); - } - enqueue(queue, *this, slice); - } -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp deleted file mode 100644 index cd7b21c6d..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -constexpr unsigned int num_elems_processed_per_iteration = 16; - -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) -{ - Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win, input_access, output_access); - input_access.set_valid_region(win, output->valid_region()); - - Status err = (window_changed) - ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; - return std::make_pair(err, win); -} -} // namespace - -CLHashtableLookupKernel::CLHashtableLookupKernel() - : _input(nullptr), _output(nullptr), _lookups(nullptr) -{ -} - -Status CLHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITensorInfo *keys, - const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *hits) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0, - "Output's shape was not set"); - - ARM_COMPUTE_ERROR_ON(lookups->dimensions(0) == hits->dimensions(0) && - output->dimension(output->num_dimensions() - 1) == lookups->dimension(0)); - ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4); - ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1); - ARM_COMPUTE_ERROR_ON(keys->num_dimensions() > 1); - ARM_COMPUTE_ERROR_ON(hits->num_dimensions() > 1); - - return Status{}; -} - -void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTensor *keys, - const ICLTensor *input, ICLTensor *output, ICLTensor *hits) -{ - ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info())); - - _lookups = lookups; - _keys = keys; - _input = input; - _output = output; - _hits = hits; - - // Make _lookup_indices tensor - _lookup_indices = arm_compute::support::cpp14::make_unique<CLTensor>(); - _lookup_indices->allocator()->init( - TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32)); - _lookup_indices->allocator()->allocate(); - - // Set kernel build options - std::stringstream kernel_name; - std::set<std::string> build_opts; - kernel_name << "hashtable_lookup"; - - build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); - build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions())); - - // Create kernel - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts)); - - // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), output->info()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); -} - -void CLHashtableLookupKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - const_cast<ICLTensor *>(_lookups)->map(queue); - const_cast<ICLTensor *>(_keys)->map(queue); - _hits->map(queue); - _lookup_indices->map(queue); - - // Set values of hits - const int32_t *lookups_buf = - reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_lookups)->buffer()); - const int32_t *keys_buf = reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_keys)->buffer()); - uint8_t *hits_buf = reinterpret_cast<uint8_t *>(_hits->buffer()); - int32_t *lookup_indices_buf = reinterpret_cast<int32_t *>(_lookup_indices->buffer()); - - std::map<int32_t, size_t> key_map; - const size_t keys_num = _keys->info()->dimension(0); - for (size_t key_index = 0; key_index < keys_num; key_index++) - { - key_map[keys_buf[key_index]] = key_index; - } - - const size_t lookups_num = _lookups->info()->dimension(0); - for (size_t i = 0; i < lookups_num; ++i) - { - const auto lookup_value = lookups_buf[i]; - const auto it = key_map.find(lookup_value); - if (it != key_map.end()) - { -#if defined(DEBUG) - if (it->second >= lookups_num) - ARM_COMPUTE_ERROR("HashTable Lookup: index out of bounds."); -#endif // defined(DEBUG) - lookup_indices_buf[i] = static_cast<int32_t>(it->second); - hits_buf[i] = static_cast<uint8_t>(1); - } - else - { - lookup_indices_buf[i] = -1; - hits_buf[i] = static_cast<uint8_t>(0); - } - } - - const_cast<ICLTensor *>(_lookups)->unmap(queue); - const_cast<ICLTensor *>(_keys)->unmap(queue); - _hits->unmap(queue); - _lookup_indices->unmap(queue); - - Window win = window.collapse(ICLKernel::window(), 2, 4); - - Window win_lookup; - win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0)); - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, _input, win); - add_4D_tensor_argument(idx, _output, win); - add_1D_tensor_argument(idx, _lookup_indices.get(), win_lookup); - - enqueue(queue, *this, win); - } while (window.slide_window_slice_4D(win) && window.slide_window_slice_1D(win_lookup)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp deleted file mode 100644 index 80d99dd3b..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLNegKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16, DataType::S32, - DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16, DataType::S32, - DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(input->info()->tensor_shape(), - output->info()->tensor_shape()); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - - return Status{}; -} - -} // namespace - -CLNegKernel::CLNegKernel() : _input(nullptr), _output(nullptr) {} - -void CLNegKernel::configure(const ICLTensor *input, ICLTensor *output) -{ - - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info())); - - _input = input; - _output = output; - - constexpr unsigned int num_elems_processed_per_iteration = 16; - - // Create kernel - std::set<std::string> build_opts; - build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); - build_opts.emplace( - ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("neg_tensor", build_opts)); - - // Configure window - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); - - AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - update_window_and_padding(win, input_access, output_access); - output_access.set_valid_region(win, input->info()->valid_region()); - - ICLKernel::configure_internal(win); -} - -void CLNegKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice); - add_3D_tensor_argument(idx, _output, slice); - enqueue(queue, *this, slice, lws_hint()); - } while (collapsed.slide_window_slice_3D(slice)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp deleted file mode 100644 index 12bbe910f..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - NormalizationLayerInfo norm_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - - // Checks performed when output is configured - if (output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); - } - - return Status{}; -} - -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, - NormalizationLayerInfo norm_info) -{ - // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*output, *input->clone()); - - const unsigned int norm_size = norm_info.norm_size(); - bool is_in_map = norm_info.is_in_map(); - - const unsigned int border_width = is_in_map ? std::min(norm_size / 2, 3U) : 0; - const BorderSize border_size = BorderSize(0, border_width); - - const unsigned int num_elems_processed_per_iteration = 4; - const unsigned int num_elems_read_per_iteration = - is_in_map ? (num_elems_processed_per_iteration + 2 * (norm_size / 2)) - : num_elems_processed_per_iteration; - - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - - // We do not use a Rectangle window for IN_MAP_2D as we clamp the top and bottom accesses inside - // the kernel, avoiding padding - AccessWindowHorizontal input_access(input, -border_size.left, num_elems_read_per_iteration); - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win, input_access, output_access); - - output_access.set_valid_region(win, input->valid_region()); - - Status err = (window_changed) - ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; - return std::make_pair(err, win); -} -} // namespace - -CLNormalizationLayerExKernel::CLNormalizationLayerExKernel() - : _input(nullptr), _output(nullptr), _border_size(0), _is_in_map(false) -{ -} - -BorderSize CLNormalizationLayerExKernel::border_size() const { return _border_size; } - -void CLNormalizationLayerExKernel::configure(const ICLTensor *input, ICLTensor *output, - NormalizationLayerInfo norm_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*output->info(), *input->info()->clone()); - - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), norm_info)); - - _input = input; - _output = output; - - const unsigned int num_elems_processed_per_iteration = 4; - const bool is_in_map_2D = (norm_info.type() == NormType::IN_MAP_2D); - - // Set build options - CLBuildOptions build_opts; - build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); - build_opts.add_option( - ("-DCOEFF=" + float_to_string_with_full_precision(norm_info.scale_coeff()))); - build_opts.add_option(("-DBETA=" + float_to_string_with_full_precision(norm_info.beta()))); - build_opts.add_option(("-DKAPPA=" + float_to_string_with_full_precision(norm_info.kappa()))); - build_opts.add_option( - ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); - build_opts.add_option(("-DRADIUS=" + support::cpp11::to_string(norm_info.norm_size()))); - build_opts.add_option(("-DNUM_SLICES=" + support::cpp11::to_string(input->info()->dimension(2)))); - build_opts.add_option_if(is_in_map_2D, "-DIN_MAP_2D"); - - // Create kernel - std::string kernel_name = - _is_in_map ? "normalization_layer_in_map" : "normalization_layer_cross_map"; - _kernel = static_cast<cl::Kernel>( - CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options())); - - // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), output->info(), norm_info); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); - - // Set config_id for enabling LWS tuning - _config_id = "normalization_layer_"; - _config_id += lower_string(string_from_data_type(input->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string( - static_cast<std::underlying_type<NormType>::type>(norm_info.type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(norm_info.norm_size()); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(1)); -} - -Status CLNormalizationLayerExKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - NormalizationLayerInfo norm_info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, norm_info)); - ARM_COMPUTE_RETURN_ON_ERROR( - validate_and_configure_window(input->clone().get(), output->clone().get(), norm_info).first); - - return Status{}; -} - -void CLNormalizationLayerExKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - const int collapsed_dimension = _is_in_map ? Window::DimZ : 4; - Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), collapsed_dimension); - Window slice = window_collapsed.first_slice_window_3D(); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice); - add_3D_tensor_argument(idx, _output, slice); - enqueue(queue, *this, slice); - } while (window_collapsed.slide_window_slice_3D(slice)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp deleted file mode 100644 index 241f8ae4d..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLPReLUKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -constexpr unsigned int num_elems_processed_per_iteration = 16; - -Status validate_info(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output) -{ - const TensorShape &out_shape = - TensorShape::broadcast_shape(input->tensor_shape(), alpha->tensor_shape()); - - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, - DataType::QASYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(alpha, 1, DataType::F16, DataType::F32, - DataType::QASYMM8); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, - "Inputs are not broadcast compatible"); - // Validate in case of configured output - if (output->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), - "Wrong shape for output"); - } - return Status{}; -} -} // namespace - -CLPReLUKernel::CLPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {} - -void CLPReLUKernel::configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output) -{ - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, alpha); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), alpha->info(), output->info())); - - _input = input; - _alpha = alpha; - _output = output; - - // Create kernel - std::string kernel_name = "prelu"; - std::set<std::string> build_opts; - build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); - build_opts.emplace( - ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); - - if (is_data_type_quantized_asymmetric(input->info()->data_type())) - { - build_opts.emplace("-DOFF_IN1=" + - support::cpp11::to_string(input->info()->quantization_info().offset)); - build_opts.emplace("-DOFF_IN2=" + - support::cpp11::to_string(alpha->info()->quantization_info().offset)); - build_opts.emplace("-DOFF_OUT=" + - support::cpp11::to_string(output->info()->quantization_info().offset)); - build_opts.emplace("-DSCALE_IN1=" + - support::cpp11::to_string(input->info()->quantization_info().scale)); - build_opts.emplace("-DSCALE_IN2=" + - support::cpp11::to_string(alpha->info()->quantization_info().scale)); - build_opts.emplace("-DSCALE_OUT=" + - support::cpp11::to_string(output->info()->quantization_info().scale)); - kernel_name += "_qasymm8"; - } - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); - - const std::pair<TensorShape, ValidRegion> broadcast_pair = - ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info()); - - const TensorShape &out_shape = broadcast_pair.first; - const ValidRegion &valid_region = broadcast_pair.second; - - // Auto initialize output if not initialized - { - set_shape_if_empty(*output->info(), out_shape); - - if (input->info()->data_type() == DataType::F16 && alpha->info()->data_type() == DataType::F16) - { - set_format_if_unknown(*output->info(), Format::F16); - } - else if (input->info()->data_type() == DataType::F32 || - alpha->info()->data_type() == DataType::F32) - { - set_format_if_unknown(*output->info(), Format::F32); - } - } - - Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); - Window win_input1 = win.broadcast_if_dimension_le_one(*input->info()); - Window win_input2 = win.broadcast_if_dimension_le_one(*alpha->info()); - - AccessWindowHorizontal input1_access(input->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal input2_access(alpha->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win_input1, input1_access) || - update_window_and_padding(win_input2, input2_access) || - update_window_and_padding(win, output_access); - - output_access.set_valid_region(win, valid_region); - - ICLKernel::configure_internal(win); -} - -void CLPReLUKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const TensorShape &in_shape1 = _input->info()->tensor_shape(); - const TensorShape &in_shape2 = _alpha->info()->tensor_shape(); - const TensorShape &out_shape = _output->info()->tensor_shape(); - - bool can_collapse = true; - if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) - { - can_collapse = - (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); - for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) - { - can_collapse = (in_shape1[d] == in_shape2[d]); - } - } - - bool has_collapsed = false; - Window collapsed = - can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) - : window; - - const TensorShape &in_shape1_collapsed = - has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; - const TensorShape &in_shape2_collapsed = - has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; - - Window slice = collapsed.first_slice_window_3D(); - Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); - Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice_input1); - add_3D_tensor_argument(idx, _alpha, slice_input2); - add_3D_tensor_argument(idx, _output, slice); - - enqueue(queue, *this, slice); - - collapsed.slide_window_slice_3D(slice_input1); - collapsed.slide_window_slice_3D(slice_input2); - } while (collapsed.slide_window_slice_3D(slice)); -} - -BorderSize CLPReLUKernel::border_size() const -{ - const unsigned int replicateSize = - _output->info()->dimension(0) - - std::min(_input->info()->dimension(0), _alpha->info()->dimension(0)); - const unsigned int border = - std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); - return BorderSize(0, border, 0, 0); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp deleted file mode 100644 index 99b54c822..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLPadLayerKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -Status validate_arguments(const ITensorInfo *input_info, const ITensorInfo *output_info, - const ITensorInfo *pad_size_info) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_info, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_info, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(pad_size_info, 1, DataType::S32); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_info->num_dimensions() > 0 && - input_info->num_dimensions() <= 4, - "Pad kernel supports upto 4-D input tensor"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - input_info->num_dimensions() == output_info->num_dimensions(), - "output tensor should have same number of dimensions as input tensor"); - - if (input_info->data_type() == DataType::QASYMM8) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_info->quantization_info() != - output_info->quantization_info(), - "The input and output quantization info are different!"); - } - - return Status{}; -} - -} // namespace - -CLPadLayerKernel::CLPadLayerKernel() : _input(nullptr), _output(nullptr), _pad_size(nullptr) {} - -void CLPadLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *pad_size) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, pad_size); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pad_size->info())); - - _input = input; - _output = output; - _pad_size = pad_size; - - // Set kernel build options - std::set<std::string> build_opts; - build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); - build_opts.emplace("-DIB=" + support::cpp11::to_string(input->info()->dimension(3))); - build_opts.emplace("-DIW=" + support::cpp11::to_string(input->info()->dimension(0))); - build_opts.emplace("-DIH=" + support::cpp11::to_string(input->info()->dimension(1))); - build_opts.emplace("-DID=" + support::cpp11::to_string(input->info()->dimension(2))); - if (input->info()->data_type() == DataType::QASYMM8) - { - build_opts.emplace("-DZERO_VALUE=" + - support::cpp11::to_string(input->info()->quantization_info().offset)); - } - else - { - build_opts.emplace("-DZERO_VALUE=" + support::cpp11::to_string(0)); - } - - // Create kernel - _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("pad", build_opts)); - - // Configure kernel window - Window win = calculate_max_window(*output->info(), Steps()); - - Coordinates coord; - coord.set_num_dimensions(output->info()->num_dimensions()); - output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); - - ICLKernel::configure_internal(win); -} - -void CLPadLayerKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); - - _pad_size->map(queue); - - // Padding values only for up, top, left and front are required based on the rank of tensor - int rank = _pad_size->info()->dimension(1); - - auto pad_batch_up = - (rank == 4) ? *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, 0})) : 0; - auto pad_height_top = - (rank >= 2) - ? *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, (rank == 2) ? 0 : 1})) - : 0; - auto pad_width_left = (rank >= 1) - ? *reinterpret_cast<const int32_t *>( - _pad_size->ptr_to_element({0, (rank == 4) ? 2 : rank - 1})) - : 0; - auto pad_depth_front = - (rank >= 3) - ? *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, (rank == 3) ? 0 : 3})) - : 0; - - _pad_size->unmap(queue); - - // Pad_values which needs to be passed - const cl_int4 paddingValues = { - {static_cast<cl_int>(pad_width_left), static_cast<cl_int>(pad_height_top), - static_cast<cl_int>(pad_depth_front), static_cast<cl_int>(pad_batch_up)}}; - - Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); - - // Setup output slice - Window slice_in(slice_out); - slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - slice_in.set(3, Window::Dimension(0, 0, 0)); - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, _input, slice_in); - add_4D_tensor_argument(idx, _output, slice_out); - _kernel.setArg<cl_int4>(idx++, paddingValues); - enqueue(queue, *this, slice_out); - } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp deleted file mode 100644 index aa094761c..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLPermuteExKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" - -using namespace arm_compute; - -namespace -{ -TensorShape get_output_shape(const ITensorInfo *input, const PermutationVector &perm) -{ - TensorShape output_shape = input->tensor_shape(); - permute(output_shape, perm); - return output_shape; -} - -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - const PermutationVector &perm) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); - - const TensorShape output_shape = - misc::shape_calculator::compute_permutation_output_shape(*input, perm); - - // Validate configured output - if (output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - } - return Status{}; -} -} // namespace - -CLPermuteExKernel::CLPermuteExKernel() : _input(nullptr), _output(nullptr), _perm() {} - -void CLPermuteExKernel::configure(const ICLTensor *input, ICLTensor *output, - const PermutationVector &perm) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), perm)); - - _input = input; - _output = output; - _perm = perm; - - const TensorShape output_shape = get_output_shape(input->info(), perm); - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); - - // Create kernel - std::set<std::string> build_opts; - - build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2))); - - // New positions of batch(D), height(H), width(w) and channel(C) based on permutation vector - build_opts.emplace("-DP1=" + support::cpp11::to_string(perm[0])); - build_opts.emplace("-DP2=" + support::cpp11::to_string(perm[1])); - build_opts.emplace("-DP3=" + support::cpp11::to_string(perm[2])); - build_opts.emplace("-DP4=" + support::cpp11::to_string(perm[3])); - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("permute_generic", build_opts)); - - // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps()); - - // The CLPermute doesn't need padding so update_window_and_padding() can be skipped - Coordinates coord; - coord.set_num_dimensions(output->info()->num_dimensions()); - output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); - - ICLKernel::configure_internal(win); -} - -Status CLPermuteExKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const PermutationVector &perm) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, perm)); - - return Status{}; -} - -void CLPermuteExKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); - - Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); - - // Setup output slice - Window slice_out(slice_in); - slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); - slice_out.set(3, Window::Dimension(0, 0, 0)); - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, _input, slice_in); - add_4D_tensor_argument(idx, _output, slice_out); - enqueue(queue, *this, slice_in); - } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp deleted file mode 100644 index b985aa737..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp +++ /dev/null @@ -1,280 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -constexpr unsigned int num_elems_processed_per_iteration = 16; - -Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, - RoundingPolicy rounding_policy) -{ - ARM_COMPUTE_UNUSED(overflow_policy); - ARM_COMPUTE_UNUSED(rounding_policy); - - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, - DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, - DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative."); - - const TensorShape &out_shape = - TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, - "Inputs are not broadcast compatible"); - - // Validate in case of configured output - if (output->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, - DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - output->data_type() == DataType::U8 && - (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8), - "Output can only be U8 if both inputs are U8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), - "Wrong shape for output"); - } - - return Status{}; -} - -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, - ITensorInfo *output) -{ - const std::pair<TensorShape, ValidRegion> broadcast_pair = - ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2); - const TensorShape &out_shape = broadcast_pair.first; - const ValidRegion &valid_region = broadcast_pair.second; - - // Auto initialize output if not initialized - { - set_shape_if_empty(*output, out_shape); - - if (input1->data_type() == DataType::S16 || input2->data_type() == DataType::S16) - { - set_format_if_unknown(*output, Format::S16); - } - else if (input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32) - { - set_format_if_unknown(*output, Format::F32); - } - } - - Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); - Window win_input1 = win.broadcast_if_dimension_le_one(*input1); - Window win_input2 = win.broadcast_if_dimension_le_one(*input2); - - AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win_input1, input1_access) || - update_window_and_padding(win_input2, input2_access) || - update_window_and_padding(win, output_access); - - output_access.set_valid_region(win, valid_region); - - Status err = (window_changed) - ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; - return std::make_pair(err, win); -} -} // namespace - -CLPixelWiseDivisionKernel::CLPixelWiseDivisionKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr) -{ -} - -void CLPixelWiseDivisionKernel::configure(const ICLTensor *input1, const ICLTensor *input2, - ICLTensor *output, float scale, - ConvertPolicy overflow_policy, - RoundingPolicy rounding_policy) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(), - scale, overflow_policy, rounding_policy)); - - // Configure kernel window - auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - - _input1 = input1; - _input2 = input2; - _output = output; - - int scale_int = -1; - // Extract sign, exponent and mantissa - int exponent = 0; - float normalized_mantissa = std::frexp(scale, &exponent); - // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15 - // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -1 <= e <= - // 14 - // Moreover, it will be negative as we deal with 1/2^n - if ((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)) - { - // Store the positive exponent. We know that we compute 1/2^n - // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5 - scale_int = std::abs(exponent - 1); - } - - std::string data_type; - std::string compute_type; - // Check if it has float inputs and output - if (is_data_type_float(input1->info()->data_type()) || - is_data_type_float(input2->info()->data_type())) - { - scale_int = -1; - compute_type = (input1->info()->data_type() == DataType::F32 || - input2->info()->data_type() == DataType::F32) - ? "float" - : "half"; - data_type = "DATA_TYPE_FLOAT"; - } - else - { - if (input1->info()->data_type() == DataType::S16 || - input2->info()->data_type() == DataType::S16) - { - compute_type = "int"; - } - else - { - compute_type = "ushort"; - } - data_type = "DATA_TYPE_INT"; - } - - // Construct kernel name - std::string kernel_name = "pixelwise_div"; - kernel_name += (scale_int >= 0) ? "_int" : "_float"; - - // Set kernel build options - std::set<std::string> build_opts; - build_opts.emplace( - (overflow_policy == ConvertPolicy::WRAP || is_data_type_float(output->info()->data_type())) - ? "-DWRAP" - : "-DSATURATE"); - build_opts.emplace((rounding_policy == RoundingPolicy::TO_ZERO) ? "-DROUND=_rtz" - : "-DROUND=_rte"); - build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type())); - build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type())); - build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); - build_opts.emplace("-DDATA_TYPE_RES=" + compute_type); - build_opts.emplace("-D" + data_type); - - // Create kernel - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); - - // Set scale argument - unsigned int idx = 3 * num_arguments_per_3D_tensor(); // Skip the inputs and output parameters - - if (scale_int >= 0) - { - _kernel.setArg(idx++, scale_int); - } - else - { - _kernel.setArg(idx++, scale); - } - - ICLKernel::configure_internal(win_config.second); -} - -Status CLPixelWiseDivisionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *output, float scale, - ConvertPolicy overflow_policy, - RoundingPolicy rounding_policy) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); - ARM_COMPUTE_RETURN_ON_ERROR( - validate_arguments(input1, input2, output, scale, overflow_policy, rounding_policy)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), - input2->clone().get(), - output->clone().get()) - .first); - - return Status{}; -} - -void CLPixelWiseDivisionKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const TensorShape &in_shape1 = _input1->info()->tensor_shape(); - const TensorShape &in_shape2 = _input2->info()->tensor_shape(); - const TensorShape &out_shape = _output->info()->tensor_shape(); - - bool can_collapse = true; - if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) - { - can_collapse = - (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); - for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d) - { - can_collapse = (in_shape1[d] == in_shape2[d]); - } - } - - bool has_collapsed = false; - Window collapsed = - can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) - : window; - - const TensorShape &in_shape1_collapsed = - has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; - const TensorShape &in_shape2_collapsed = - has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; - - Window slice = collapsed.first_slice_window_3D(); - Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); - Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input1, slice_input1); - add_3D_tensor_argument(idx, _input2, slice_input2); - add_3D_tensor_argument(idx, _output, slice); - enqueue(queue, *this, slice); - - collapsed.slide_window_slice_3D(slice_input1); - collapsed.slide_window_slice_3D(slice_input2); - } while (collapsed.slide_window_slice_3D(slice)); -} - -BorderSize CLPixelWiseDivisionKernel::border_size() const -{ - const unsigned int replicateSize = - _output->info()->dimension(0) - - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); - const unsigned int border = - std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); - return BorderSize(0, border, 0, 0); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp deleted file mode 100644 index f581780e1..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; -namespace -{ -// NOTE This is necessary because it is not guaranteed that the axis positions of input and output -// are the same. -const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis) -{ - TensorShape out_shape{input_shape}; - - out_shape.set(axis, 1); - - return out_shape; -} -} // namespace - -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis, - ReduceOperation op) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - - if (output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - } - - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, - DataType::F32, DataType::S32); - if (op == ReduceOperation::MEAN || op == ReduceOperation::SUM) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8, - "Not support QASYMM8, yet"); - } - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0, - "Inputs are not broadcast compatible"); - - const auto num_dimensions = input->tensor_shape().num_dimensions(); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - axis >= 0 && axis < num_dimensions, - "axis must be greater than or equal to 0 and less than (input's rank)."); - - const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(), - "output shape's size does not match axis"); - - return Status{}; -} -} // namespace - -CLReduceOperationKernel::CLReduceOperationKernel() : _input(nullptr), _output(nullptr), _axis() {} - -void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *output, - const uint32_t axis, ReduceOperation op) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op)); - - _input = input; - _output = output; - _axis = axis; - - std::unique_ptr<ITensorInfo> output_info = output->info()->clone(); - output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis)); - - // Construct kernel name - std::string kernel_name; - int op_code = 0; - if (op == ReduceOperation::MAX) - { - kernel_name = "reduce_min_max"; - op_code = 1; - } - else if (op == ReduceOperation::MIN) - { - kernel_name = "reduce_min_max"; - op_code = 2; - } - else if (op == ReduceOperation::SUM) - { - kernel_name = "reduce_sum_mean"; - op_code = 3; - } - else if (op == ReduceOperation::MEAN) - { - kernel_name = "reduce_sum_mean"; - op_code = 4; - } - else - throw std::runtime_error("Operation not supported, yet"); - - // Set kernel build options - std::set<std::string> build_opts; - build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(output_info->data_type())); - build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2))); - build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code)); - - // Create kernel - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); - - // Configure kernel window - Window win = calculate_max_window(*output_info, Steps()); - - Coordinates coord; - coord.set_num_dimensions(output_info->num_dimensions()); - output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape())); - - ICLKernel::configure_internal(win); -} - -Status CLReduceOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const uint32_t axis, ReduceOperation op) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op)); - - return Status{}; -} - -void CLReduceOperationKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const TensorShape &shape_in = _input->info()->tensor_shape(); - - unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters - - _kernel.setArg<cl_int>(idx++, _axis); - _kernel.setArg<cl_int>(idx++, shape_in[_axis]); - - // Support dimensions up to 4 - Window slice_out = window.collapse(ICLKernel::window(), 2, 4); - - // Setup input slice - Window slice_in(slice_out); - slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - slice_in.set(3, Window::Dimension(0, 0, 0)); - - // Copy output's shape in order to use for recovering at end of this method - // TODO Remove changing and recovering output's shape if it is guaranteed that the axis positions - // of input and output are the same - const TensorShape shape_out = _output->info()->tensor_shape(); - _output->info()->set_tensor_shape(inferOutputShape(shape_in, _axis)); - - idx = 0; - add_4D_tensor_argument(idx, _input, slice_in); - add_4D_tensor_argument(idx, _output, slice_out); - enqueue(queue, *this, slice_out); - - // Recover output's shape of output tensor - _output->info()->set_tensor_shape(shape_out); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp deleted file mode 100644 index 6b0697e89..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp +++ /dev/null @@ -1,238 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -constexpr unsigned int num_elems_processed_per_iteration = 16; - -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_size, - const ITensorInfo *padding_size, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::F16, DataType::S32, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_size, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(padding_size, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::F16, DataType::S32, - DataType::F32); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != output->num_dimensions(), - "The number of dimensions of input should be equal to output"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() != output->data_layout(), - "The input and output layouts are different!"); - - // TODO Support other cases - if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NCHW) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != output->dimension(2), - "Input Depth should be equal to Output Depth"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 || - padding_size->dimension(1) != 2, - "Only 2-dimensional spatial block's size was wrong"); - } - else if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NHWC) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(0) != output->dimension(0), - "Input Depth should be equal to Output Depth"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 || - padding_size->dimension(1) != 2, - "Only 2-dimensional spatial block's size was wrong"); - } - else - { - ARM_COMPUTE_RETURN_ERROR_MSG("CLSpaceToBatchNDKernel supports only 4-dimensional input"); - } - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() < 2 && input->num_dimensions() > 4, - "CLSpaceToBatchNDKernel supports dimensions up to 4"); - - if (input->data_type() == DataType::QASYMM8) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->quantization_info() != output->quantization_info(), - "The input and output quantization info are different!"); - } - - return Status{}; -} - -} // namespace - -CLSpaceToBatchNDKernel::CLSpaceToBatchNDKernel() : _input(nullptr), _output(nullptr) {} - -void CLSpaceToBatchNDKernel::configure(const ICLTensor *input, const ICLTensor *block_size, - const ICLTensor *padding_size, ICLTensor *output) -{ - - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(input->info(), block_size->info(), padding_size->info(), output->info())); - - _input = input; - _block_size = block_size; - _padding_size = padding_size; - _output = output; - - // Set kernel build options - // TODO Support other cases - std::string kernel_name = "space_to_batch_4d"; - std::set<std::string> build_opts; - Window win; - - if (input->info()->data_layout() == DataLayout::NCHW) - { - kernel_name += "_nchw"; - build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); - build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(1))); - build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(0))); - - win = calculate_max_window(*output->info(), Steps()); - - Coordinates coord; - coord.set_num_dimensions(output->info()->num_dimensions()); - output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); - } - else if (input->info()->data_layout() == DataLayout::NHWC) - { - kernel_name += "_nhwc"; - build_opts.emplace("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); - build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(2))); - build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(1))); - build_opts.emplace("-DVEC_SIZE=" + - support::cpp11::to_string(num_elems_processed_per_iteration)); - - win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win, input_access, output_access); - input_access.set_valid_region(win, output->info()->valid_region()); - - if (window_changed) - { - ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!"); - } - } - else - { - ARM_COMPUTE_ERROR("Unsupported layout"); - } - - build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.emplace("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(3))); - if (input->info()->data_type() == DataType::QASYMM8) - { - build_opts.emplace("-DZERO_VALUE=" + - support::cpp11::to_string(input->info()->quantization_info().offset)); - } - else - { - build_opts.emplace("-DZERO_VALUE=" + support::cpp11::to_string(0)); - } - - // Create kernel - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); - - // Configure kernel window - ICLKernel::configure_internal(win); -} - -void CLSpaceToBatchNDKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); - -#if defined(DEBUG) - const_cast<ICLTensor *>(_block_size)->map(queue); - const_cast<ICLTensor *>(_padding_size)->map(queue); - - const size_t num_dimensions = _input->info()->num_dimensions(); - const size_t num_spacial_dimensions = _block_size->info()->dimension(0); - int32_t batch_size = _input->info()->dimension(num_dimensions - 1); - for (size_t i = 0; i < num_spacial_dimensions; ++i) - { - const int32_t block_size = *reinterpret_cast<int32_t *>(_block_size->ptr_to_element({i})); - const int32_t padding_size_pre = - *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({0, i})); - const int32_t padding_size_post = - *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({1, i})); - - ARM_COMPUTE_ERROR_ON_MSG(block_size < 1, "Block size should be greater than or equal to 1"); - ARM_COMPUTE_ERROR_ON_MSG(padding_size_pre < 0 && padding_size_post < 0, - "Padding size should be greater than or equal to 0"); - - if (num_dimensions == 4 && _input->info()->data_layout() == DataLayout::NCHW) - { - ARM_COMPUTE_ERROR_ON_MSG( - _output->info()->dimension(i) != - (_input->info()->dimension(i) + padding_size_pre + padding_size_post) / block_size, - "Dimension value of spatial block does not match output's dimension value"); - } - else - { - ARM_COMPUTE_ERROR_ON_MSG( - _output->info()->dimension(num_dimensions - num_spacial_dimensions - 1 + i) != - (_input->info()->dimension(num_dimensions - num_spacial_dimensions - 1 + i) + - padding_size_pre + padding_size_post) / - block_size, - "Dimension value of spatial block does not match output's dimension value"); - } - - batch_size *= block_size; - } - ARM_COMPUTE_ERROR_ON_MSG( - _output->info()->dimension(num_dimensions - 1) != batch_size, - "Output batch size should be equal to input batch size * (multiplication of all block size)"); - - const_cast<ICLTensor *>(_block_size)->unmap(queue); - const_cast<ICLTensor *>(_padding_size)->unmap(queue); -#endif // defined(DEBUG) - - Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); - - // Setup output slice - Window slice_in(slice_out); - slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - slice_in.set(3, Window::Dimension(0, 0, 0)); - - // Set block size window - Window win_block = calculate_max_window(*_block_size->info(), Steps()); - - // Set padding size window - Window win_padding = calculate_max_window(*_padding_size->info(), Steps()); - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, _input, slice_in); - add_4D_tensor_argument(idx, _output, slice_out); - add_1D_tensor_argument(idx, _block_size, win_block); - add_2D_tensor_argument(idx, _padding_size, win_padding); - enqueue(queue, *this, slice_out); - } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp deleted file mode 100644 index 5d6329edc..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - const int32_t block_size) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size >= 1, - "Block size should be greater than or equal to 1."); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(3) == output->dimension(3), - "Input batch should be equal to Output batch"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - input->dimension(2) * block_size * block_size == output->dimension(2), - "Output depth should be equal to (input depth * block size *block size)"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(input->dimension(0) % block_size) && - !(input->dimension(1) % block_size), - "Input height and width should be divisible by block size"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) == (input->dimension(0) / block_size)) && - (output->dimension(1) == (input->dimension(1) / block_size)), - "Output height and width should be equal to " - "input_height/blocksize and input_width/blocksize respectively"); - - return Status{}; -} - -} // namespace - -CLSpaceToDepthKernel::CLSpaceToDepthKernel() : _input(nullptr), _output(nullptr) {} - -void CLSpaceToDepthKernel::configure(const ICLTensor *input, ICLTensor *output, - const int32_t block_size) -{ - - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size)); - - _input = input; - _output = output; - - // Set kernel build options - std::set<std::string> build_opts; - build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size)); - build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2))); - - // Create kernel - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("space_to_depth", build_opts)); - - // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps()); - - Coordinates coord; - coord.set_num_dimensions(output->info()->num_dimensions()); - output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); - - ICLKernel::configure_internal(win); -} - -void CLSpaceToDepthKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); - - Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); - - // Setup output slice - Window slice_out(slice_in); - slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); - slice_out.set(3, Window::Dimension(0, 0, 0)); - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, _input, slice_in); - add_4D_tensor_argument(idx, _output, slice_out); - enqueue(queue, *this, slice_in); - } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp deleted file mode 100644 index 260bc39f1..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp +++ /dev/null @@ -1,170 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLSquaredDifferenceKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -constexpr unsigned int num_elems_processed_per_iteration = 16; - -Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) -{ - const TensorShape &out_shape = - TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); - - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::F16, DataType::F32); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, - "Inputs are not broadcast compatible"); - // Validate in case of configured output - if (output->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), - "Wrong shape for output"); - } - return Status{}; -} -} // namespace - -CLSquaredDifferenceKernel::CLSquaredDifferenceKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr) -{ -} - -void CLSquaredDifferenceKernel::configure(const ICLTensor *input1, const ICLTensor *input2, - ICLTensor *output) -{ - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); - ARM_COMPUTE_ERROR_THROW_ON(validate(input1->info(), input2->info(), output->info())); - - _input1 = input1; - _input2 = input2; - _output = output; - - // Create kernel - std::set<std::string> build_opts; - build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type()))); - build_opts.emplace( - ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("squared_difference", build_opts)); - - const std::pair<TensorShape, ValidRegion> broadcast_pair = - ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info()); - - const TensorShape &out_shape = broadcast_pair.first; - const ValidRegion &valid_region = broadcast_pair.second; - - // Auto initialize output if not initialized - { - set_shape_if_empty(*output->info(), out_shape); - - if (input1->info()->data_type() == DataType::F16 && - input2->info()->data_type() == DataType::F16) - { - set_format_if_unknown(*output->info(), Format::F16); - } - else if (input1->info()->data_type() == DataType::F32 || - input2->info()->data_type() == DataType::F32) - { - set_format_if_unknown(*output->info(), Format::F32); - } - } - - Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); - Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info()); - Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info()); - - AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win_input1, input1_access) || - update_window_and_padding(win_input2, input2_access) || - update_window_and_padding(win, output_access); - - output_access.set_valid_region(win, valid_region); - - ICLKernel::configure_internal(win); -} - -void CLSquaredDifferenceKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const TensorShape &in_shape1 = _input1->info()->tensor_shape(); - const TensorShape &in_shape2 = _input2->info()->tensor_shape(); - const TensorShape &out_shape = _output->info()->tensor_shape(); - - bool can_collapse = true; - if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) - { - can_collapse = - (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); - for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) - { - can_collapse = (in_shape1[d] == in_shape2[d]); - } - } - - bool has_collapsed = false; - Window collapsed = - can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) - : window; - - const TensorShape &in_shape1_collapsed = - has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; - const TensorShape &in_shape2_collapsed = - has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; - - Window slice = collapsed.first_slice_window_3D(); - Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); - Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input1, slice_input1); - add_3D_tensor_argument(idx, _input2, slice_input2); - add_3D_tensor_argument(idx, _output, slice); - - enqueue(queue, *this, slice); - - collapsed.slide_window_slice_3D(slice_input1); - collapsed.slide_window_slice_3D(slice_input2); - } while (collapsed.slide_window_slice_3D(slice)); -} - -BorderSize CLSquaredDifferenceKernel::border_size() const -{ - const unsigned int replicateSize = - _output->info()->dimension(0) - - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); - const unsigned int border = - std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); - return BorderSize(0, border, 0, 0); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp deleted file mode 100644 index 48146a43a..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLStridedSliceExKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/TensorInfo.h" - -using namespace arm_compute; - -CLStridedSliceExKernel::CLStridedSliceExKernel() - : _input(nullptr), _output(nullptr), _beginData(nullptr), _endData(nullptr), - _stridesData(nullptr), _beginMask(0), _endMask(0), _shrinkAxisMask(0) -{ -} - -Status CLStridedSliceExKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *begin, const ITensorInfo *end, - const ITensorInfo *strides, int32_t beginMask, - int32_t endMask, int32_t shrinkAxisMask) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, begin, end, strides); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(begin, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(end, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(strides, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - - ARM_COMPUTE_ERROR_ON(begin->num_dimensions() != 1 || begin->dimension(0) > 4); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(begin->tensor_shape(), end->tensor_shape(), - strides->tensor_shape()); - - return Status{}; -} - -// Return the index for the first element along that axis. This index will be a -// positive integer between [0, axisSize - 1] that can be used to index -// directly into the data. -inline int32_t StartForAxis(int32_t beginMask, int32_t begin, int32_t stride, - const TensorShape &inputShape, int32_t axis) -{ - // Begin with the specified index - int32_t start = begin; - - // beginMask override - if (beginMask & 1 << axis) - { - if (stride > 0) - { - // Forward iteration - use the first element. These values will get - // clamped below (Note: We could have set them to 0 and axisSize-1, but - // use lowest() and max() to maintain symmetry with StopForAxis()) - start = std::numeric_limits<int32_t>::lowest(); - } - else - { - // Backward iteration - use the last element. - start = std::numeric_limits<int32_t>::max(); - } - } - - // Handle negative indices - int32_t axisSize = inputShape[axis]; - if (start < 0) - { - start += axisSize; - } - - // Clamping - start = arm_compute::utility::clamp(start, 0, axisSize - 1); - - return start; -} - -// Return the "real" index for the end of iteration along that axis. This is an -// "end" in the traditional C sense, in that it points to one past the last -// element. ie. So if you were iterating through all elements of a 1D array of -// size 4, this function would return 4 as the stop, because it is one past the -// "real" indices of 0, 1, 2 & 3. -inline int32_t StopForAxis(int32_t endMask, int32_t end, int32_t stride, - const TensorShape &inputShape, int32_t axis) -{ - // Begin with the specified index - int32_t stop = end; - - // endMask override - if (endMask & (1 << axis)) - { - if (stride > 0) - { - // Forward iteration - use the last element. These values will get - // clamped below - stop = std::numeric_limits<int32_t>::max(); - } - else - { - // Backward iteration - use the first element. - stop = std::numeric_limits<int32_t>::lowest(); - } - } - - // Handle negative indices - int32_t axisSize = inputShape[axis]; - if (stop < 0) - { - stop += axisSize; - } - - // Clamping - // Because the end index points one past the last element, we need slightly - // different clamping ranges depending on the direction. - if (stride > 0) - { - // Forward iteration - stop = arm_compute::utility::clamp(stop, 0, axisSize); - } - else - { - // Backward iteration - stop = arm_compute::utility::clamp(stop, -1, axisSize - 1); - } - - return stop; -} - -inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride) -{ - int32_t ret = 0; - if (stride > 0) - { - ret = ((stop - start - 1) / stride) + 1; - } - else - { - ret = ((stop - start + 1) / stride) + 1; - } - ARM_COMPUTE_ERROR_ON_MSG(ret < 0, "The dimension must be the natural number"); - return ret; -} - -void CLStridedSliceExKernel::configure(const ICLTensor *input, ICLTensor *output, - ICLTensor *beginData, ICLTensor *endData, - ICLTensor *stridesData, int32_t beginMask, int32_t endMask, - int32_t shrinkAxisMask) -{ - ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), beginData->info(), - endData->info(), stridesData->info(), beginMask, endMask, - shrinkAxisMask)); - - _input = input; - _output = output; - _beginData = beginData; - _endData = endData; - _stridesData = stridesData; - _beginMask = beginMask; - _endMask = endMask; - _shrinkAxisMask = shrinkAxisMask; - - // Set kernel build options - std::set<std::string> build_opts; - build_opts.emplace("-DELEMENT_DATA_TYPE=" + - get_cl_type_from_data_type(input->info()->data_type())); - build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); - - // Create kernel - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("strided_slice_ex", build_opts)); - - // Configure kernel window - Window win = calculate_max_window(*output->info(), Steps()); - ICLKernel::configure_internal(win); -} - -void CLStridedSliceExKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - _beginData->map(queue); - _endData->map(queue); - _stridesData->map(queue); - - std::vector<int32_t> starts; - std::vector<int32_t> strides; - - for (uint32_t n = 0; n < _beginData->info()->tensor_shape().total_size(); ++n) - { - const TensorShape shape = _input->info()->tensor_shape(); - starts.emplace_back( - StartForAxis(_beginMask, reinterpret_cast<int32_t *>(_beginData->buffer())[n], - reinterpret_cast<int32_t *>(_stridesData->buffer())[n], shape, n)); - - strides.emplace_back(reinterpret_cast<int32_t *>(_stridesData->buffer())[n]); - } - - for (uint32_t n = _beginData->info()->tensor_shape().total_size(); n < 4; n++) - { - starts.emplace_back(0); - strides.emplace_back(1); - } - // TODO: Apply shrinkAxisMask - - _beginData->unmap(queue); - _stridesData->unmap(queue); - _endData->unmap(queue); - - unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters - const cl_int4 startsArg = {{ - static_cast<cl_int>(starts[0]), static_cast<cl_int>(starts[1]), - static_cast<cl_int>(starts[2]), static_cast<cl_int>(starts[3]), - }}; - _kernel.setArg<cl_int4>(idx++, startsArg); - - const cl_int4 stridesArg = {{ - static_cast<cl_int>(strides[0]), static_cast<cl_int>(strides[1]), - static_cast<cl_int>(strides[2]), static_cast<cl_int>(strides[3]), - }}; - _kernel.setArg<cl_int4>(idx++, stridesArg); - - Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); - - // Setup output slice - Window slice_in(slice_out); - slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - slice_in.set(3, Window::Dimension(0, 0, 0)); - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, _input, slice_in); - add_4D_tensor_argument(idx, _output, slice_out); - enqueue(queue, *this, slice_out); - } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp deleted file mode 100644 index 073c2f7bb..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp +++ /dev/null @@ -1,468 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -namespace arm_compute -{ -//////////////////////////////////////////////////////////////////////////////// -CLTopKV2Single::CLTopKV2Single() : _input(nullptr), _topk_values(nullptr), _topk_indices(nullptr) {} - -void CLTopKV2Single::configure(ICLTensor *input, ICLTensor *topk_values, ICLTensor *topk_indices, - cl::Buffer *indices, cl::Buffer *temp_stack, int k, int n) -{ - ARM_COMPUTE_ERROR_ON(input == nullptr && indices == nullptr); - ARM_COMPUTE_ERROR_ON(topk_values == nullptr && topk_indices == nullptr); - ARM_COMPUTE_ERROR_ON(n == 0); - - _input = input; - _topk_values = topk_values; - _topk_indices = topk_indices; - - // Set kernel build options - std::set<std::string> build_opts; - - // Create kernel - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("topkv2_quicksort", build_opts)); - - unsigned int idx = 3 * num_arguments_per_1D_tensor(); - _kernel.setArg(idx++, *indices); - _kernel.setArg(idx++, *temp_stack); - _kernel.setArg<cl_int>(idx++, k); - _kernel.setArg<cl_int>(idx++, n); - - // Configure kernel window - Window win; - win.set(0, Window::Dimension(0, 1, 1)); - ICLKernel::configure_internal(win); -} - -void CLTopKV2Single::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - unsigned int idx = 0; - add_1D_tensor_argument(idx, _input, window); - add_1D_tensor_argument(idx, _topk_values, window); - add_1D_tensor_argument(idx, _topk_indices, window); - - enqueue(queue, *this, window); -} - -//////////////////////////////////////////////////////////////////////////////// -CLTopKV2Init::CLTopKV2Init() : _input(nullptr) {} - -void CLTopKV2Init::configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffer *in_ind_buf, - int n) -{ - ARM_COMPUTE_ERROR_ON(input == nullptr && in_key_buf == nullptr); - ARM_COMPUTE_ERROR_ON(in_ind_buf == nullptr); - ARM_COMPUTE_ERROR_ON(n == 0); - - _input = input; - - // Set kernel build options - std::set<std::string> build_opts; - - // Create kernel - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("topkv2_init", build_opts)); - - unsigned int idx = num_arguments_per_1D_tensor(); - _kernel.setArg(idx++, *in_key_buf); - _kernel.setArg(idx++, *in_ind_buf); - _kernel.setArg<cl_int>(idx++, n); - - // Configure kernel window - Window win; - win.set(0, Window::Dimension(0, n, 1)); - ICLKernel::configure_internal(win); -} - -void CLTopKV2Init::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - unsigned int idx = 0; - add_1D_tensor_argument(idx, _input, window); - - enqueue(queue, *this, window); -} - -//////////////////////////////////////////////////////////////////////////////// -// This kernel makes a histogram of radix for each work item. -CLRadixSortHistogram::CLRadixSortHistogram() : _pass(0), _in_key_buf(nullptr) {} - -void CLRadixSortHistogram::configure(cl::Buffer *hist_buf, int bits, int n) -{ - ARM_COMPUTE_ERROR_ON(hist_buf == nullptr); - - unsigned int radix = 1 << bits; - // Set kernel build options - std::set<std::string> build_opts; - build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); - build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); - build_opts.emplace("-DPERMUT=1"); - - // Create kernel - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("radixsort_histogram", build_opts)); - - int loc_histo_size = radix * _ITEMS * sizeof(cl_int); - - unsigned int idx = 1; - _kernel.setArg(idx++, *hist_buf); - - idx = 3; - _kernel.setArg(idx++, loc_histo_size, nullptr); - _kernel.setArg<cl_int>(idx++, n); - - // Configure kernel window - Window win; - win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1)); - ICLKernel::configure_internal(win); -} - -void CLRadixSortHistogram::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - _kernel.setArg(0, *_in_key_buf); - _kernel.setArg<cl_int>(2, _pass); - - cl::NDRange lws = cl::NDRange(_ITEMS, 1); - - enqueue(queue, *this, window, lws); -} - -//////////////////////////////////////////////////////////////////////////////// -CLRadixSortScanHistogram::CLRadixSortScanHistogram() {} - -void CLRadixSortScanHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits) -{ - ARM_COMPUTE_ERROR_ON(hist_buf == nullptr && glob_sum_buf == nullptr); - - unsigned int radix = 1 << bits; - // Set kernel build options - std::set<std::string> build_opts; - build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); - build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); - build_opts.emplace("-DPERMUT=1"); - - // Create kernel - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("radixsort_scanhistograms", build_opts)); - - int temp_size = - std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint); - - unsigned int idx = 0; - _kernel.setArg(idx++, *hist_buf); - _kernel.setArg(idx++, temp_size, nullptr); - _kernel.setArg(idx++, *glob_sum_buf); - - // Configure kernel window - Window win; - win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1)); - ICLKernel::configure_internal(win); -} - -void CLRadixSortScanHistogram::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); - cl::NDRange lws = cl::NDRange(gws_x / _HISTOSPLIT, 1); - - enqueue(queue, *this, window, lws); -} - -//////////////////////////////////////////////////////////////////////////////// -CLRadixSortGlobalScanHistogram::CLRadixSortGlobalScanHistogram() {} - -void CLRadixSortGlobalScanHistogram::configure(cl::Buffer *glob_sum_buf, cl::Buffer *temp_buf, - int bits) -{ - ARM_COMPUTE_ERROR_ON(glob_sum_buf == nullptr && temp_buf == nullptr); - - unsigned int radix = 1 << bits; - // Set kernel build options - std::set<std::string> build_opts; - build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); - build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); - build_opts.emplace("-DPERMUT=1"); - - // Create kernel - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("radixsort_scanhistograms", build_opts)); - - int temp_size = - std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint); - - unsigned int idx = 0; - _kernel.setArg(idx++, *glob_sum_buf); - _kernel.setArg(idx++, temp_size, nullptr); - _kernel.setArg(idx++, *temp_buf); - - // Configure kernel window - Window win; - win.set(0, Window::Dimension(0, _HISTOSPLIT / 2, 1)); - ICLKernel::configure_internal(win); -} - -void CLRadixSortGlobalScanHistogram::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); - cl::NDRange lws = cl::NDRange(gws_x, 1); - - enqueue(queue, *this, window, lws); -} - -//////////////////////////////////////////////////////////////////////////////// -CLRadixSortPasteHistogram::CLRadixSortPasteHistogram() {} - -void CLRadixSortPasteHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits) -{ - ARM_COMPUTE_ERROR_ON(hist_buf == nullptr && glob_sum_buf == nullptr); - - unsigned int radix = 1 << bits; - // Set kernel build options - std::set<std::string> build_opts; - build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); - build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); - build_opts.emplace("-DPERMUT=1"); - - // Create kernel - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("radixsort_pastehistograms", build_opts)); - - unsigned int idx = 0; - _kernel.setArg(idx++, *hist_buf); - _kernel.setArg(idx++, *glob_sum_buf); - - // Configure kernel window - Window win; - win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1)); - ICLKernel::configure_internal(win); -} - -void CLRadixSortPasteHistogram::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); - cl::NDRange lws = cl::NDRange(gws_x / _HISTOSPLIT, 1); - - enqueue(queue, *this, window, lws); -} - -//////////////////////////////////////////////////////////////////////////////// -CLRadixSortReorder::CLRadixSortReorder() - : _pass(0), _in_key_buf(nullptr), _out_key_buf(nullptr), _in_ind_buf(nullptr), - _out_ind_buf(nullptr) -{ -} - -void CLRadixSortReorder::configure(cl::Buffer *hist_buf, int bits, int n) -{ - ARM_COMPUTE_ERROR_ON(hist_buf == nullptr); - ARM_COMPUTE_ERROR_ON(n == 0); - - unsigned int radix = 1 << bits; - // Set kernel build options - std::set<std::string> build_opts; - build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); - build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); - build_opts.emplace("-DPERMUT=1"); - - // Create kernel - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("radixsort_reorder", build_opts)); - - unsigned int idx = 2; - _kernel.setArg(idx++, *hist_buf); - - idx = 6; - _kernel.setArg(idx++, sizeof(uint) * radix * _ITEMS, nullptr); - _kernel.setArg<cl_int>(idx++, n); - - // Configure kernel window - Window win; - win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1)); - ICLKernel::configure_internal(win); -} - -void CLRadixSortReorder::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); - unsigned int lx = std::max(1U, (gws_x / _HISTOSPLIT)); - cl::NDRange lws = (lx < gws_x) ? cl::NDRange(lx, 1) : cl::NDRange(1, 1); - - _kernel.setArg(0, *_in_key_buf); - _kernel.setArg(1, *_out_key_buf); - _kernel.setArg<cl_int>(3, _pass); - _kernel.setArg(4, *_in_ind_buf); - _kernel.setArg(5, *_out_ind_buf); - - enqueue(queue, *this, window, lws); -} - -//////////////////////////////////////////////////////////////////////////////// -CLTopKV2FindFirstNegative::CLTopKV2FindFirstNegative() : _out_key_buf(nullptr) {} - -void CLTopKV2FindFirstNegative::configure(cl::Buffer *first_negative_idx_buf, int n) -{ - ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr); - ARM_COMPUTE_ERROR_ON(n == 0); - - // Set kernel build options - std::set<std::string> build_opts; - - // Create kernel - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("topkv2_find_first_negative", build_opts)); - - unsigned int idx = 1; - _kernel.setArg(idx++, *first_negative_idx_buf); - _kernel.setArg<cl_int>(idx++, n); - - // Configure kernel window - Window win; - win.set(0, Window::Dimension(0, n, 1)); - ICLKernel::configure_internal(win); -} - -void CLTopKV2FindFirstNegative::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - unsigned int idx = 0; - _kernel.setArg(idx++, *_out_key_buf); - - enqueue(queue, *this, window); -} - -//////////////////////////////////////////////////////////////////////////////// -CLTopKV2ReorderNegatives::CLTopKV2ReorderNegatives() - : _in_key_buf(nullptr), _out_key_buf(nullptr), _in_ind_buf(nullptr), _out_ind_buf(nullptr) -{ -} - -void CLTopKV2ReorderNegatives::configure(cl::Buffer *first_negative_idx_buf, int n) -{ - ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr); - ARM_COMPUTE_ERROR_ON(n == 0); - - // Set kernel build options - std::set<std::string> build_opts; - - // Create kernel - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("topkv2_reorder_negatives", build_opts)); - - unsigned int idx = 4; - _kernel.setArg(idx++, *first_negative_idx_buf); - _kernel.setArg<cl_int>(idx++, n); - - // Configure kernel window - Window win; - win.set(0, Window::Dimension(0, n, 1)); - ICLKernel::configure_internal(win); -} - -void CLTopKV2ReorderNegatives::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - unsigned int idx = 0; - _kernel.setArg(idx++, *_in_key_buf); - _kernel.setArg(idx++, *_out_key_buf); - _kernel.setArg(idx++, *_in_ind_buf); - _kernel.setArg(idx++, *_out_ind_buf); - - enqueue(queue, *this, window); -} - -//////////////////////////////////////////////////////////////////////////////// -CLTopKV2Store::CLTopKV2Store() - : _values(nullptr), _indices(nullptr), _out_key_buf(nullptr), _out_ind_buf(nullptr) -{ -} - -void CLTopKV2Store::configure(ICLTensor *values, ICLTensor *indices, int k, int n) -{ - ARM_COMPUTE_ERROR_ON(values == nullptr && indices == nullptr); - ARM_COMPUTE_ERROR_ON(k == 0); - ARM_COMPUTE_ERROR_ON(k > n); - - _values = values; - _indices = indices; - - // Set kernel build options - std::set<std::string> build_opts; - - // Create kernel - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("topkv2_store", build_opts)); - - unsigned int idx = 2 * num_arguments_per_1D_tensor() + 2; - _kernel.setArg<cl_int>(idx++, n); - - // Configure kernel window - Window win; - win.set(0, Window::Dimension(0, k, 1)); - ICLKernel::configure_internal(win); -} - -void CLTopKV2Store::setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf) -{ - _out_key_buf = out_key_buf; - _out_ind_buf = out_ind_buf; -} - -void CLTopKV2Store::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - unsigned int idx = 0; - add_1D_tensor_argument(idx, _values, window); - add_1D_tensor_argument(idx, _indices, window); - _kernel.setArg(idx++, *_out_key_buf); - _kernel.setArg(idx++, *_out_ind_buf); - - enqueue(queue, *this, window); -} - -} // namespace arm_compute |