diff options
Diffstat (limited to 'libs/ARMComputeEx/src/core/CL/kernels')
26 files changed, 3042 insertions, 507 deletions
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp
new file mode 100644
index 000000000..1fdd2f98f
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLActivationLayerExKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/UtilsEx.h"
+
+using namespace arm_compute;
+
+namespace
+{
+// Checks the tensor infos handed to the activation kernel.
+//
+// input must be single-channel U8, QASYMM8, F16 or F32. When output is non-null and already
+// has a non-zero total size, its shape and data type must match input; otherwise those
+// comparisons are skipped. Returns an empty Status when every check passes.
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          const ActivationLayerInfoEx &act_info)
+{
+  // act_info is part of the signature but no check below reads it.
+  ARM_COMPUTE_UNUSED(act_info);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+                                                       DataType::F16, DataType::F32);
+
+  // Checks performed when output is configured
+  if ((output != nullptr) && (output->total_size() != 0))
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+  }
+
+  return Status{};
+}
+
+// Builds the execution window and updates the padding of the tensor infos.
+//
+// The step along X is 16 / element_size elements. A null output means the kernel runs in
+// place, so only the input access window is registered. The returned Status carries
+// "Insufficient Padding!" when update_window_and_padding reports that the window changed.
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+  // input is dereferenced on every path below, including the in-place one (output == nullptr),
+  // so it is checked unconditionally rather than only when an output is present.
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+  if (output != nullptr)
+  {
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output, *input);
+  }
+
+  const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
+
+  Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+  bool window_changed = false;
+
+  if (output != nullptr)
+  {
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    window_changed = update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, input->valid_region());
+  }
+  else
+  {
+    window_changed = update_window_and_padding(
+        win, AccessWindowHorizontal(input, 0, num_elems_processed_per_iteration));
+  }
+
+  Status err = (window_changed)
+                   ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+                   : Status{};
+  return std::make_pair(err, win);
+}
+} // namespace
+
+CLActivationLayerExKernel::CLActivationLayerExKernel()
+    : _input(nullptr), _output(nullptr), _run_in_place(false)
+{
+}
+
+// Configures the kernel: derives the OpenCL build options from the input/output tensor infos
+// and act_info, creates the "activation_layer_ex" kernel and sets its execution window.
+// A null output, or output == input, selects in-place execution.
+void CLActivationLayerExKernel::configure(ICLTensor *input, ICLTensor *output,
+                                          ActivationLayerInfoEx act_info)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+  _run_in_place = (output == nullptr) || (output == input);
+
+  if (output != nullptr)
+  {
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), *input->info()->clone());
+  }
+
+  ARM_COMPUTE_ERROR_THROW_ON(
+      validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, act_info));
+
+  const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+  const DataType dt = input->info()->data_type();
+  float a_const = act_info.a();
+  float b_const = act_info.b();
+  int a_const_int = 0;
+  int b_const_int = 0;
+
+  // Create quantized version of constants a, b if needed
+  if (is_data_type_quantized(dt))
+  {
+    a_const_int =
+        input->info()->quantization_info().quantize(a_const, RoundingPolicy::TO_NEAREST_UP);
+    b_const_int =
+        input->info()->quantization_info().quantize(b_const, RoundingPolicy::TO_NEAREST_UP);
+  }
+
+  // Set build options
+  std::set<std::string> build_opts;
+  build_opts.emplace(
+      ("-DACT=" + lower_string(string_from_activation_func_ex(act_info.activation()))));
+  build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
+  build_opts.emplace(
+      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+
+  if (is_data_type_quantized(dt))
+  {
+    build_opts.emplace(("-DA_VAL=" + support::cpp11::to_string(a_const_int)));
+    build_opts.emplace(("-DB_VAL=" + support::cpp11::to_string(b_const_int)));
+
+    const int o1 = input->info()->quantization_info().offset;
+    // Quantized value of 0 corresponds to the offset o1
+    build_opts.emplace(("-DCONST_0=" + support::cpp11::to_string(o1)));
+
+    // Set scale and offset of the input and output if they have different quantization info
+    if (is_data_type_quantized_asymmetric(dt) && output != nullptr)
+    {
+      const float s1 = input->info()->quantization_info().scale;
+      const float s2 = output->info()->quantization_info().scale;
+      const int o2 = output->info()->quantization_info().offset;
+
+      if (o1 != o2 || s1 != s2)
+      {
+        build_opts.emplace(("-DS1_VAL=" + float_to_string_with_full_precision(s1)));
+        build_opts.emplace(("-DS2_VAL=" + float_to_string_with_full_precision(s2)));
+        build_opts.emplace(("-DO1_VAL=" + support::cpp11::to_string(o1)));
+        build_opts.emplace(("-DO2_VAL=" + support::cpp11::to_string(o2)));
+      }
+    }
+  }
+  else
+  {
+    build_opts.emplace(("-DA_VAL=" + float_to_string_with_full_precision(a_const)));
+    build_opts.emplace(("-DB_VAL=" + float_to_string_with_full_precision(b_const)));
+  }
+
+  build_opts.emplace((_run_in_place) ? "-DIN_PLACE" : "");
+
+  // Create kernel
+  std::string kernel_name = std::string("activation_layer_ex");
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+  // Make sure _kernel is initialized before calling the parent's configure
+  _input = input;
+  _output = output;
+
+  // Configure kernel window
+  auto win_config =
+      validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info());
+  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+  ICLKernel::configure_internal(win_config.second);
+
+  // Set config_id for enabling LWS tuning
+  _config_id = "activation_layer_ex_";
+  _config_id += lower_string(string_from_data_type(dt));
+  _config_id += "_";
+  _config_id += support::cpp11::to_string(input->info()->dimension(0));
+  _config_id += "_";
+  _config_id += support::cpp11::to_string(input->info()->dimension(1));
+}
+
+// Static check mirroring configure(): validates the arguments and the window configuration
+// on clones of the tensor infos, so the caller's infos are not modified.
+Status CLActivationLayerExKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                           const ActivationLayerInfoEx &act_info)
+{
+  const bool run_in_place = (output == nullptr) || (output == input);
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, act_info));
+  ARM_COMPUTE_RETURN_ON_ERROR(
+      validate_and_configure_window(input->clone().get(),
+                                    (run_in_place) ? nullptr : output->clone().get())
+          .first);
+
+  return Status{};
+}
+
+// Enqueues the kernel once per 3D slice of the (possibly collapsed) window. The output
+// tensor argument is only passed when the kernel does not run in place.
+void CLActivationLayerExKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+  Window slice = collapsed.first_slice_window_3D();
+
+  do
+  {
+    unsigned int idx = 0;
+    add_3D_tensor_argument(idx, _input, slice);
+    if (!_run_in_place)
+    {
+      add_3D_tensor_argument(idx, _output, slice);
+    }
+    enqueue(queue, *this, slice, lws_hint());
+  } while (collapsed.slide_window_slice_3D(slice));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp
new file mode 100644
index 000000000..c1a2ad0be
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLArgMinMaxKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+// Returns a copy of input_shape with the dimension at argminmax_axis set to 1.
+const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t argminmax_axis)
+{
+  TensorShape out_shape{input_shape};
+
+  out_shape.set(argminmax_axis, 1);
+
+  return out_shape;
+}
+} // namespace
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+// Validates the tensor infos and the reduction axis for the arg-min/max kernel.
+//
+// input must be single-channel S32, F32 or U8; output must have a non-zero total size equal
+// to the size of the input shape with the reduced axis set to 1; argminmax_axis must be
+// smaller than the number of input dimensions. Returns an empty Status on success.
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          const uint32_t argminmax_axis, ArgOperation op)
+{
+  // op is accepted for signature symmetry with configure(); no check depends on it.
+  ARM_COMPUTE_UNUSED(op);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32, DataType::F32,
+                                                       DataType::U8);
+  // NOTE(review): the two ARM_COMPUTE_ERROR_ON_* checks below are kept exactly as written.
+  // Unlike the rest of this function they do not report through the returned Status.
+  // Confirm what they are meant to compare for an arg-min/max output before converting them
+  // to the ARM_COMPUTE_RETURN_ERROR_ON_* form.
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(input, output);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
+                                  "Inputs are not broadcast compatible");
+
+  // The macro reports an error when its condition is TRUE, so the condition must describe the
+  // invalid case. argminmax_axis is unsigned, hence only the upper bound needs testing. The
+  // check runs before inferOutputShape() so an out-of-range axis never reaches it.
+  const auto num_dimensions = input->tensor_shape().num_dimensions();
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      argminmax_axis >= num_dimensions,
+      "argminmax_axis must be greater than or equal to 0 and less than (input's rank).");
+
+  const TensorShape output_shape = inferOutputShape(input->tensor_shape(), argminmax_axis);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(),
+                                  "output shape's size does not match argminmax_axis");
+
+  return Status{};
+}
+
+} // namespace
+
+CLArgMinMaxKernel::CLArgMinMaxKernel() : _input(nullptr), _output(nullptr), _argminmax_axis() {}
+
+// Configures the kernel: stores the tensors and axis, builds the "arg_op" OpenCL kernel with
+// an OP_CODE selecting max (1) or min (2), and sets the window from the inferred output shape.
+// Throws std::runtime_error for any other ArgOperation value.
+void CLArgMinMaxKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                  const uint32_t argminmax_axis, ArgOperation op)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  // validate_arguments takes four parameters; op was previously missing from this call.
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), argminmax_axis, op));
+
+  _input = input;
+  _output = output;
+  _argminmax_axis = argminmax_axis;
+
+  // Work on a clone so the caller's output info keeps its own shape.
+  std::unique_ptr<ITensorInfo> output_info = output->info()->clone();
+  output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), argminmax_axis));
+
+  // The kernel name is fixed; the operation is selected through the OP_CODE build option
+  std::string kernel_name = "arg_op";
+  int op_code = 0;
+  if (op == ArgOperation::MAX)
+  {
+    op_code = 1;
+  }
+  else if (op == ArgOperation::MIN)
+  {
+    op_code = 2;
+  }
+  else
+    throw std::runtime_error("Operation not supported, yet");
+
+  // Set kernel build options
+  std::set<std::string> build_opts;
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(output_info->data_type()));
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2)));
+  build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code));
+
+  // Create kernel
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+  // Configure kernel window
+  Window win = calculate_max_window(*output_info, Steps());
+
+  Coordinates coord;
+  coord.set_num_dimensions(output_info->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape()));
+
+  ICLKernel::configure_internal(win);
+}
+
+// Static validation entry point. Null pointers are reported through the returned Status
+// rather than through an assertion, matching the Status-returning contract.
+Status CLArgMinMaxKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                   const uint32_t argminmax_axis, ArgOperation op)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, argminmax_axis, op));
+
+  return Status{};
+}
+
+// Enqueues the kernel over 4D slices of the output window. The axis and the input extent
+// along that axis are passed as the two arguments following the input/output tensor
+// arguments. The output tensor shape is temporarily replaced by the inferred shape and
+// restored before returning.
+void CLArgMinMaxKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  const TensorShape &shape_in = _input->info()->tensor_shape();
+
+  unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
+
+  _kernel.setArg<cl_int>(idx++, _argminmax_axis);
+  _kernel.setArg<cl_int>(idx++, shape_in[_argminmax_axis]);
+
+  Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  // Setup input slice
+  Window slice_in(slice_out);
+  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+  slice_in.set(3, Window::Dimension(0, 0, 0));
+
+  // Copy output's shape in order to use for recovering at end of this method
+  const TensorShape shape_out = _output->info()->tensor_shape();
+  _output->info()->set_tensor_shape(inferOutputShape(shape_in, _argminmax_axis));
+
+  do
+  {
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, slice_in);
+    add_4D_tensor_argument(idx, _output, slice_out);
+    enqueue(queue, *this, slice_out);
+  } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
+
+  // Recover output's shape of output tensor
+  _output->info()->set_tensor_shape(shape_out);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp
new file mode 100644
index 000000000..1c505b4d5
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionExKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+// Validates the operands of the subtraction kernel.
+//
+// Both inputs must be single-channel U8, S16, F16 or F32 and their shapes must broadcast to
+// a non-empty shape. When output already has a non-zero total size it must use one of the
+// same data types, may only be U8 if both inputs are U8, and must have the broadcast shape.
+Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
+                          const ITensorInfo *output, ConvertPolicy policy)
+{
+  ARM_COMPUTE_UNUSED(policy);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16,
+                                                       DataType::F16, DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16,
+                                                       DataType::F16, DataType::F32);
+
+  const TensorShape &out_shape =
+      TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
+                                  "Inputs are not broadcast compatible");
+
+  // Validate in case of configured output
+  if (output->total_size() > 0)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        output->data_type() == DataType::U8 &&
+            (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
+        "Output can only be U8 if both inputs are U8");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
+        "Wrong shape for output");
+  }
+
+  return Status{};
+}
+
+// Auto-initializes output (shape and, where it can be derived from the inputs, format) and
+// computes the execution window plus the padding of all three tensor infos.
+// The returned Status carries "Insufficient Padding!" when any update changed the window.
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2,
+                                                        ITensorInfo *output)
+{
+  const std::pair<TensorShape, ValidRegion> broadcast_pair =
+      ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
+  const TensorShape &out_shape = broadcast_pair.first;
+  const ValidRegion &valid_region = broadcast_pair.second;
+
+  // Auto initialize output if not initialized
+  {
+    set_shape_if_empty(*output, out_shape);
+
+    // No branch covers two U8 inputs, so the format is left as it is in that case.
+    if (input1->data_type() == DataType::S16 || input2->data_type() == DataType::S16)
+    {
+      set_format_if_unknown(*output, Format::S16);
+    }
+    else if (input1->data_type() == DataType::F16 && input2->data_type() == DataType::F16)
+    {
+      set_format_if_unknown(*output, Format::F16);
+    }
+    else if (input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32)
+    {
+      set_format_if_unknown(*output, Format::F32);
+    }
+  }
+
+  Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+  Window win_input1 = win.broadcast_if_dimension_le_one(*input1);
+  Window win_input2 = win.broadcast_if_dimension_le_one(*input2);
+
+  AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+  // The || chain stops at the first update that reports a change; that case is turned into
+  // an error Status below.
+  bool window_changed = update_window_and_padding(win_input1, input1_access) ||
+                        update_window_and_padding(win_input2, input2_access) ||
+                        update_window_and_padding(win, output_access);
+
+  output_access.set_valid_region(win, valid_region);
+
+  Status err = (window_changed)
+                   ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+                   : Status{};
+  return std::make_pair(err, win);
+}
+} // namespace
+
+CLArithmeticSubtractionExKernel::CLArithmeticSubtractionExKernel()
+    : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+// Configures the kernel: validates the operands, sets up window and padding, and builds the
+// "arithmetic_sub_ex" OpenCL kernel. -DWRAP is used when policy is WRAP or the output type is
+// floating point, -DSATURATE otherwise.
+void CLArithmeticSubtractionExKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
+                                                ICLTensor *output, ConvertPolicy policy)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+  ARM_COMPUTE_ERROR_THROW_ON(
+      validate_arguments(input1->info(), input2->info(), output->info(), policy));
+
+  // Configure kernel window
+  auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
+  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
+  _input1 = input1;
+  _input2 = input2;
+  _output = output;
+
+  const bool has_float_out = is_data_type_float(output->info()->data_type());
+
+  // Set kernel build options
+  std::set<std::string> build_opts;
+  build_opts.emplace((policy == ConvertPolicy::WRAP || has_float_out) ? "-DWRAP" : "-DSATURATE");
+  build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
+  build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
+  build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+
+  // Create kernel
+  _kernel = static_cast<cl::Kernel>(
+      CLKernelLibraryEx::get().create_kernel("arithmetic_sub_ex", build_opts));
+
+  ICLKernel::configure_internal(win_config.second);
+}
+
+// Static validation entry point. Works on clones of the tensor infos so the caller's infos
+// are not modified by the window configuration.
+Status CLArithmeticSubtractionExKernel::validate(const ITensorInfo *input1,
+                                                 const ITensorInfo *input2,
+                                                 const ITensorInfo *output, ConvertPolicy policy)
+{
+  // Report null pointers through the returned Status: validate_arguments dereferences output,
+  // and an assert-style check offers no protection where it is compiled out.
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, policy));
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(),
+                                                            input2->clone().get(),
+                                                            output->clone().get())
+                                  .first);
+
+  return Status{};
+}
+
+// Enqueues the kernel per 3D slice. Dimensions from Z upward are collapsed only when the
+// smaller input has at most one element, or when both inputs have more than DimZ dimensions
+// and agree on every dimension from DimZ up to the output rank.
+void CLArithmeticSubtractionExKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  const TensorShape &in_shape1 = _input1->info()->tensor_shape();
+  const TensorShape &in_shape2 = _input2->info()->tensor_shape();
+  const TensorShape &out_shape = _output->info()->tensor_shape();
+
+  bool can_collapse = true;
+  if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+  {
+    can_collapse =
+        (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+    for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+    {
+      can_collapse = (in_shape1[d] == in_shape2[d]);
+    }
+  }
+
+  bool has_collapsed = false;
+  Window collapsed =
+      can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
+                   : window;
+
+  const TensorShape &in_shape1_collapsed =
+      has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+  const TensorShape &in_shape2_collapsed =
+      has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+  Window slice = collapsed.first_slice_window_3D();
+  Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+  Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+  do
+  {
+    unsigned int idx = 0;
+
+    add_3D_tensor_argument(idx, _input1, slice_input1);
+    add_3D_tensor_argument(idx, _input2, slice_input2);
+    add_3D_tensor_argument(idx, _output, slice);
+
+    enqueue(queue, *this, slice);
+
+    // Advance the input slices alongside the output slice
+    collapsed.slide_window_slice_3D(slice_input1);
+    collapsed.slide_window_slice_3D(slice_input2);
+  } while (collapsed.slide_window_slice_3D(slice));
+}
+
+// Border derived from how much wider the output is than the narrower input along
+// dimension 0, capped at num_elems_processed_per_iteration - 1. It is passed as the second
+// BorderSize argument; the other three are 0.
+BorderSize CLArithmeticSubtractionExKernel::border_size() const
+{
+  const unsigned int replicateSize =
+      _output->info()->dimension(0) -
+      std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+  const unsigned int border =
+      std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
+  return BorderSize(0, border, 0, 0);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp
new file mode 100644
index 000000000..b0016d23c
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+// Validates the tensor infos and block sizes for the batch-to-space kernel.
+//
+// ARM_COMPUTE_RETURN_ERROR_ON_MSG reports an error when its condition is TRUE, so every
+// condition below describes the INVALID case. block_size[0] is paired with dimension 1 and
+// block_size[1] with dimension 0, as in the original checks.
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          const int32_t *block_size)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+                                                       DataType::S16, DataType::S32, DataType::F16,
+                                                       DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+                                                       DataType::S16, DataType::S32, DataType::F16,
+                                                       DataType::F32);
+
+  // block_size is indexed below, so reject a null pointer first.
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size == nullptr, "Block size should not be null.");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size[0] < 1 || block_size[1] < 1,
+                                  "Block size should be greater than or equal to 1.");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != output->dimension(2),
+                                  "Input Depth should be equal to Output Depth");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      output->dimension(3) * block_size[0] * block_size[1] != input->dimension(3),
+      "Input batch should be equal to (output batch * block size[0] *block size[1])");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) % block_size[1]) ||
+                                      (output->dimension(1) % block_size[0]),
+                                  "Output height and width should be divisible by block size[0] "
+                                  "and block_size[1] respectively");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != input->dimension(0) * block_size[1]) ||
+                                      (output->dimension(1) != input->dimension(1) * block_size[0]),
+                                  "Output height and width should be equal to "
+                                  "input_height*blocksize[0] and input_width*blocksize[1] "
+                                  "respectively");
+
+  return Status{};
+}
+
+} // namespace
+
+CLBatchToSpaceNDKernel::CLBatchToSpaceNDKernel() : _input(nullptr), _output(nullptr) {}
+
+// Configures the kernel: validates the arguments, builds the "batch_to_space_nd" OpenCL
+// kernel with the block sizes and the output batch/depth as build options, and sets the
+// execution window from the output tensor info.
+void CLBatchToSpaceNDKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                       const int32_t *block_size)
+{
+
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size));
+
+  _input = input;
+  _output = output;
+
+  // Set kernel build options
+  std::set<std::string> build_opts;
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.emplace("-DBLOCK_SIZE0=" + support::cpp11::to_string(block_size[0]));
+  build_opts.emplace("-DBLOCK_SIZE1=" + support::cpp11::to_string(block_size[1]));
+  build_opts.emplace("-DBATCH_OUT=" + support::cpp11::to_string(output->info()->dimension(3)));
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+
+  // Create kernel
+  _kernel = static_cast<cl::Kernel>(
+      CLKernelLibraryEx::get().create_kernel("batch_to_space_nd", build_opts));
+
+  // Configure kernel window
+  Window win = calculate_max_window(*output->info(), Steps());
+
+  Coordinates coord;
+  coord.set_num_dimensions(output->info()->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+  ICLKernel::configure_internal(win);
+}
+
+// Enqueues the kernel over 4D slices of the window.
+// Note the local naming: slice_in is the slice derived from the kernel window and is the one
+// passed with _output and to enqueue(), while slice_out (all dimensions reset to (0, 0, 0))
+// is the one passed with _input.
+void CLBatchToSpaceNDKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+  Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  // Setup output slice
+  Window slice_out(slice_in);
+  slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+  slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+  slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+  slice_out.set(3, Window::Dimension(0, 0, 0));
+
+  do
+  {
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, slice_out);
+    add_4D_tensor_argument(idx, _output, slice_in);
+    enqueue(queue, *this, slice_in);
+  } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
new file mode 100644
index 000000000..3d2f2c702
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+// Validates the operands of the binary logical kernel.
+//
+// Both inputs must be single-channel U8 or QASYMM8 and their shapes must broadcast to a
+// non-empty shape. When output already has a non-zero total size it must be U8 or QASYMM8
+// and have the broadcast shape.
+Status validate_parameters(const ITensorInfo *input1, const ITensorInfo *input2,
+                           const ITensorInfo *output)
+{
+  const TensorShape &out_shape =
+      TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
+                                  "Inputs are not broadcast compatible");
+  // Validate in case of configured output
+  if (output->total_size() > 0)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8,
+                                                         DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
+        "Wrong shape for output");
+  }
+  return Status{};
+}
+} // namespace
+
+CLBinaryLogicalOpKernel::CLBinaryLogicalOpKernel()
+    : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+// Configures the kernel: validates the operands, builds the "binary_logical_op" OpenCL kernel
+// with an OP_CODE selecting AND (1) or OR (2), and sets up window and padding for the
+// broadcast operands. Throws std::runtime_error for any other BinaryLogicalOperation value.
+void CLBinaryLogicalOpKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
+                                        ICLTensor *output, BinaryLogicalOperation op)
+{
+  // Every pointer is dereferenced below; check them up front like the sibling kernels do.
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_parameters(input1->info(), input2->info(), output->info()));
+
+  _input1 = input1;
+  _input2 = input2;
+  _output = output;
+
+  // Create kernel
+  std::string kernel_name = "binary_logical_op";
+  std::set<std::string> build_opts;
+  build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())));
+
+  int op_code = 0;
+  switch (op)
+  {
+    case BinaryLogicalOperation::AND:
+      op_code = 1;
+      break;
+    case BinaryLogicalOperation::OR:
+      op_code = 2;
+      break;
+    default:
+      throw std::runtime_error("Operation not supported, yet");
+  }
+
+  build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code)));
+  build_opts.emplace(
+      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+  const std::pair<TensorShape, ValidRegion> broadcast_pair =
+      ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
+
+  const ValidRegion &valid_region = broadcast_pair.second;
+
+  Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+  Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info());
+  Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info());
+
+  AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+  // Run all three updates unconditionally. A short-circuiting || chain would skip the later
+  // updates as soon as one reported a change, and the result is not turned into an error here.
+  bool window_changed = update_window_and_padding(win_input1, input1_access);
+  window_changed = update_window_and_padding(win_input2, input2_access) || window_changed;
+  window_changed = update_window_and_padding(win, output_access) || window_changed;
+  // NOTE(review): the sibling kernels report "Insufficient Padding!" when the window changes;
+  // confirm whether this kernel should do the same instead of ignoring the result.
+  ARM_COMPUTE_UNUSED(window_changed);
+
+  output_access.set_valid_region(win, valid_region);
+
+  ICLKernel::configure_internal(win);
+}
+
+// Enqueues the kernel per 3D slice. Dimensions from Z upward are collapsed only when the
+// smaller input has at most one element, or when both inputs have more than DimZ dimensions
+// and agree on every dimension from DimZ up to the output rank.
+void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  const TensorShape &in_shape1 = _input1->info()->tensor_shape();
+  const TensorShape &in_shape2 = _input2->info()->tensor_shape();
+  const TensorShape &out_shape = _output->info()->tensor_shape();
+
+  bool can_collapse = true;
+  if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+  {
+    can_collapse =
+        (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+    for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+    {
+      can_collapse = (in_shape1[d] == in_shape2[d]);
+    }
+  }
+
+  bool has_collapsed = false;
+  Window collapsed =
+      can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
+                   : window;
+
+  const TensorShape &in_shape1_collapsed =
+      has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+  const TensorShape &in_shape2_collapsed =
+      has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+  Window slice = collapsed.first_slice_window_3D();
+  Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+  Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+  do
+  {
+    unsigned int idx = 0;
+    add_3D_tensor_argument(idx, _input1, slice_input1);
+    add_3D_tensor_argument(idx, _input2, slice_input2);
+    add_3D_tensor_argument(idx, _output, slice);
+
+    enqueue(queue, *this, slice);
+
+    // Advance the input slices alongside the output slice
+    collapsed.slide_window_slice_3D(slice_input1);
+    collapsed.slide_window_slice_3D(slice_input2);
+  } while (collapsed.slide_window_slice_3D(slice));
+}
+
+// Border derived from how much wider the output is than the narrower input along
+// dimension 0, capped at num_elems_processed_per_iteration - 1. It is passed as the second
+// BorderSize argument; the other three are 0.
+BorderSize CLBinaryLogicalOpKernel::border_size() const
+{
+  const unsigned int replicateSize =
+      _output->info()->dimension(0) -
+      std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+  const unsigned int border =
+      std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
+  return BorderSize(0, border, 0, 0);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
index b019e8c33..bf7ebae3f 100644
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
@@ -17,15 +17,8 @@
 #include
"arm_compute/core/CL/kernels/CLCastKernel.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IAccessWindow.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" using namespace arm_compute; @@ -60,8 +53,8 @@ void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output) { const float scale_in = input->info()->quantization_info().scale; const int offset_in = input->info()->quantization_info().offset; - build_opts.emplace("-DSCALE_IN=" + float_to_string_with_full_precision(scale_in)); - build_opts.emplace("-DOFFSET_IN=" + support::cpp11::to_string(offset_in)); + build_opts.emplace("-DSCALE=" + float_to_string_with_full_precision(scale_in)); + build_opts.emplace("-DOFFSET=" + support::cpp11::to_string(offset_in)); _kernel = static_cast<cl::Kernel>( CLKernelLibraryEx::get().create_kernel("cast_qasymm_in", build_opts)); @@ -70,8 +63,8 @@ void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output) { const float scale_in = output->info()->quantization_info().scale; const int offset_in = output->info()->quantization_info().offset; - build_opts.emplace("-DSCALE_IN=" + float_to_string_with_full_precision(scale_in)); - build_opts.emplace("-DOFFSET_IN=" + support::cpp11::to_string(offset_in)); + build_opts.emplace("-DSCALE=" + float_to_string_with_full_precision(scale_in)); + build_opts.emplace("-DOFFSET=" + support::cpp11::to_string(offset_in)); _kernel = static_cast<cl::Kernel>( CLKernelLibraryEx::get().create_kernel("cast_qasymm_out", build_opts)); @@ -88,7 +81,7 @@ void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output) update_window_and_padding(win, input_access, output_access); output_access.set_valid_region(win, 
input->info()->valid_region()); - ICLKernel::configure(win); + ICLKernel::configure_internal(win); } void CLCastKernel::run(const Window &window, cl::CommandQueue &queue) diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp new file mode 100644 index 000000000..5af5b16ea --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#include "arm_compute/core/CL/kernels/CLComparisonOpKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+// Elements handled per work-item along X; also passed to the OpenCL kernel as VEC_SIZE.
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+/** Validates the tensor infos of a comparison operation.
+ *
+ * Both inputs must be single-channel U8/U16/S16/F16/S32/F32/QASYMM8 and broadcast compatible.
+ * If the output is already configured it must be single-channel QASYMM8 and have the
+ * broadcast shape.
+ *
+ * @param input1 First input tensor info.
+ * @param input2 Second input tensor info.
+ * @param output Output tensor info; may be unconfigured (total size 0).
+ *
+ * @return An empty Status on success, an error Status otherwise.
+ */
+Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
+                          const ITensorInfo *output)
+{
+  const TensorShape &out_shape =
+      TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::U16,
+                                                       DataType::S16, DataType::F16, DataType::S32,
+                                                       DataType::F32, DataType::QASYMM8);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::U16,
+                                                       DataType::S16, DataType::F16, DataType::S32,
+                                                       DataType::F32, DataType::QASYMM8);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
+                                  "Inputs are not broadcast compatible");
+  // Validate in case of configured output
+  if (output->total_size() > 0)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
+        "Wrong shape for output");
+  }
+  return Status{};
+}
+} // namespace
+
+/** Creates an unconfigured kernel; all tensor pointers start out null. */
+CLComparisonOpKernel::CLComparisonOpKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+/** Binds the tensors, builds the comparison OpenCL kernel and sets up its window.
+ *
+ * The "comparison_op" kernel is used, or "comparison_op_qasymm8" when the inputs are
+ * asymmetric-quantized with differing offset or scale.
+ *
+ * @param input1 First input tensor.
+ * @param input2 Second input tensor; must have the same data type as @p input1.
+ * @param output Output tensor.
+ * @param op     Comparison to perform. EQUAL maps to OP_CODE 1 and NOT_EQUAL to OP_CODE 2;
+ *               any other value throws std::runtime_error.
+ */
+void CLComparisonOpKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
+                                     ICLTensor *output, const ComparisonOperation &op)
+{
+  // Reject null tensors before any of them is dereferenced below.
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info()));
+
+  _input1 = input1;
+  _input2 = input2;
+  _output = output;
+
+  // Create kernel
+  std::string kernel_name = "comparison_op";
+  int op_code = 0;
+
+  switch (op)
+  {
+    case ComparisonOperation::EQUAL:
+      op_code = 1;
+      break;
+    case ComparisonOperation::NOT_EQUAL:
+      op_code = 2;
+      break;
+    default:
+      throw std::runtime_error(" Operation not supported, yet");
+  }
+
+  std::set<std::string> build_opts;
+  build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code)));
+  build_opts.emplace(("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input1->info()->data_type())));
+  build_opts.emplace(
+      ("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())));
+  build_opts.emplace(
+      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+
+  if (is_data_type_quantized_asymmetric(input1->info()->data_type()) &&
+      ((input1->info()->quantization_info().offset != input2->info()->quantization_info().offset) ||
+       (input1->info()->quantization_info().scale != input2->info()->quantization_info().scale)))
+  {
+    build_opts.emplace("-DOFFSET_IN1=" +
+                       support::cpp11::to_string(input1->info()->quantization_info().offset));
+    build_opts.emplace("-DOFFSET_IN2=" +
+                       support::cpp11::to_string(input2->info()->quantization_info().offset));
+    // Scales are floats: serialize them with full precision so small values are not truncated
+    // by the fixed-precision to_string formatting.
+    build_opts.emplace("-DSCALE_IN1=" + float_to_string_with_full_precision(
+                                            input1->info()->quantization_info().scale));
+    build_opts.emplace("-DSCALE_IN2=" + float_to_string_with_full_precision(
+                                            input2->info()->quantization_info().scale));
+    kernel_name += "_qasymm8";
+  }
+
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+  const std::pair<TensorShape, ValidRegion> broadcast_pair =
+      ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
+
+  const TensorShape &out_shape = broadcast_pair.first;
+  const ValidRegion &valid_region = broadcast_pair.second;
+
+  // Auto initialize output if not initialized
+  // NOTE(review): validate_arguments() only accepts a QASYMM8 output, while the formats chosen
+  // below are S16/F16/F32, and DATA_TYPE_OUT has already been read from the output above -
+  // confirm whether this auto-initialization is still intended.
+  {
+    set_shape_if_empty(*output->info(), out_shape);
+
+    if (input1->info()->data_type() == DataType::S16 ||
+        input2->info()->data_type() == DataType::S16)
+    {
+      set_format_if_unknown(*output->info(), Format::S16);
+    }
+    else if (input1->info()->data_type() == DataType::F16 &&
+             input2->info()->data_type() == DataType::F16)
+    {
+      set_format_if_unknown(*output->info(), Format::F16);
+    }
+    else if (input1->info()->data_type() == DataType::F32 ||
+             input2->info()->data_type() == DataType::F32)
+    {
+      set_format_if_unknown(*output->info(), Format::F32);
+    }
+  }
+
+  Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+  Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info());
+  Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info());
+
+  AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+  // NOTE(review): the `||` chain short-circuits, so the later update_window_and_padding calls
+  // are skipped as soon as an earlier one returns true - confirm this is intended.
+  const bool window_changed = update_window_and_padding(win_input1, input1_access) ||
+                              update_window_and_padding(win_input2, input2_access) ||
+                              update_window_and_padding(win, output_access);
+  // The result is not acted upon here; mark it as used to avoid an unused-variable warning.
+  ARM_COMPUTE_UNUSED(window_changed);
+
+  output_access.set_valid_region(win, valid_region);
+
+  ICLKernel::configure_internal(win);
+}
+
+/** Enqueues the kernel over @p window, one 3D slice at a time.
+ *
+ * Dimensions from Z upward are collapsed only when the two input shapes agree on them (or when
+ * one of the inputs has at most one element); the input slices are broadcast where an input
+ * dimension is <= 1.
+ *
+ * @param window Execution window; must be a valid sub-window of the configured window.
+ * @param queue  Command queue the kernel is enqueued on.
+ */
+void CLComparisonOpKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  const TensorShape &in_shape1 = _input1->info()->tensor_shape();
+  const TensorShape &in_shape2 = _input2->info()->tensor_shape();
+  const TensorShape &out_shape = _output->info()->tensor_shape();
+
+  bool can_collapse = true;
+  if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+  {
+    can_collapse =
+        (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+    for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+    {
+      can_collapse = (in_shape1[d] == in_shape2[d]);
+    }
+  }
+
+  bool has_collapsed = false;
+  Window collapsed =
+      can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
+                   : window;
+
+  const TensorShape &in_shape1_collapsed =
+      has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+  const TensorShape &in_shape2_collapsed =
+      has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+  Window slice = collapsed.first_slice_window_3D();
+  Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+  Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+  do
+  {
+    unsigned int idx = 0;
+    add_3D_tensor_argument(idx, _input1, slice_input1);
+    add_3D_tensor_argument(idx, _input2, slice_input2);
+    add_3D_tensor_argument(idx, _output, slice);
+
+    enqueue(queue, *this, slice);
+
+    collapsed.slide_window_slice_3D(slice_input1);
+    collapsed.slide_window_slice_3D(slice_input2);
+  } while (collapsed.slide_window_slice_3D(slice));
+}
+
+/** Returns the right-hand border: the difference between the output width and the narrower
+ * input width, capped at (num_elems_processed_per_iteration - 1). All other borders are 0.
+ */
+BorderSize CLComparisonOpKernel::border_size() const
+{
+  const unsigned int replicateSize =
+      _output->info()->dimension(0) -
+      std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+  const unsigned int border =
+      std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
+  return BorderSize(0, border, 0, 0);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp new file mode 100644 index 000000000..c386e3312 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+/** Validates the tensor infos and block size of a depth-to-space operation.
+ *
+ * ARM_COMPUTE_RETURN_ERROR_ON_MSG returns an error when its condition is TRUE, so every
+ * condition below describes the failure case, not the requirement.
+ *
+ * @param input      Input tensor info (single channel U8/QASYMM8/S16/S32/F16/F32).
+ * @param output     Output tensor info (same set of data types).
+ * @param block_size Block size; must be >= 1.
+ *
+ * @return An empty Status on success, an error Status otherwise.
+ */
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          const int32_t block_size)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+                                                       DataType::S16, DataType::S32, DataType::F16,
+                                                       DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+                                                       DataType::S16, DataType::S32, DataType::F16,
+                                                       DataType::F32);
+  // Checked first: the shape checks below multiply and divide by block_size.
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1,
+                                  "Block size should be greater than or equal to 1.");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0) * block_size,
+                                  "Output width should be equal to (Input width * block size)");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(1) != input->dimension(1) * block_size,
+                                  "Output height should be equal to (Input height * block size)");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) % (block_size * block_size) != 0,
+                                  "Input depth should be divisible by (block size * block size)");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      output->dimension(2) != input->dimension(2) / (block_size * block_size),
+      "Output depth should be equal to (Input depth / (block size * block size))");
+
+  return Status{};
+}
+} // namespace
+
+/** Creates an unconfigured kernel; both tensor pointers start out null. */
+CLDepthToSpaceKernel::CLDepthToSpaceKernel() : _input(nullptr), _output(nullptr)
+{
+  // DO NOTHING
+}
+
+/** Binds the tensors, builds the "depth_to_space" OpenCL kernel and sets up its window.
+ *
+ * @param input      Input tensor.
+ * @param output     Output tensor; its shape must already be set, since the kernel is built
+ *                   with DEPTH_OUT taken from the output's dimension 2.
+ * @param block_size Block size, passed to the kernel as BLOCK_SIZE.
+ */
+void CLDepthToSpaceKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                     const int32_t block_size)
+{
+  // Reject null tensors and inconsistent shapes before anything is dereferenced or built.
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size));
+
+  _input = input;
+  _output = output;
+
+  // Set kernel build options
+  std::set<std::string> build_opts;
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+
+  // Create kernel
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("depth_to_space", build_opts));
+
+  // Configure kernel window
+  Window win = calculate_max_window(*output->info(), Steps());
+
+  // The whole output tensor is valid: region anchored at the origin, spanning the full shape.
+  Coordinates coord;
+  coord.set_num_dimensions(output->info()->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+  ICLKernel::configure_internal(win);
+}
+
+/** Enqueues the kernel over @p window using 4D slices.
+ *
+ * The output slice drives the enqueue; the input slice has its first four dimensions reset
+ * to (0, 0, 0) before being passed as a kernel argument.
+ *
+ * @param window Execution window; must match the configured window.
+ * @param queue  Command queue the kernel is enqueued on.
+ */
+void CLDepthToSpaceKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+  Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  // Setup input slice
+  Window slice_in(slice_out);
+  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+  slice_in.set(3, Window::Dimension(0, 0, 0));
+
+  do
+  {
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, slice_in);
+    add_4D_tensor_argument(idx, _output, slice_out);
+    enqueue(queue, *this, slice_out);
+  } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp new file mode 100644 index
000000000..0862b78bf --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp @@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+// Elements handled per work-item along X; also passed to the OpenCL kernel as VEC_SIZE.
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+/** Computes the execution window over the output and requests horizontal padding.
+ *
+ * @param input  Input tensor info.
+ * @param output Output tensor info; the window is derived from it.
+ *
+ * @return The window, paired with a RUNTIME_ERROR Status ("Insufficient Padding!") when
+ *         update_window_and_padding reports a change, or an empty Status otherwise.
+ */
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+  Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+  AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+  bool window_changed = update_window_and_padding(win, input_access, output_access);
+  input_access.set_valid_region(win, output->valid_region());
+
+  Status err = (window_changed)
+                   ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+                   : Status{};
+  return std::make_pair(err, win);
+}
+} // namespace
+
+/** Creates an unconfigured kernel; all tensor pointers start out null. */
+CLEmbeddingLookupKernel::CLEmbeddingLookupKernel()
+    : _input(nullptr), _output(nullptr), _lookups(nullptr)
+{
+}
+
+/** Validates the tensor infos of an embedding lookup.
+ *
+ * Failures are reported through the returned Status (the RETURN_ERROR macro family) so that
+ * callers of this static check can inspect the result.
+ *
+ * @param input   Input tensor info; single channel, 2 to 4 dimensions.
+ * @param output  Output tensor info; must have the same data type as @p input.
+ * @param lookups Lookup indices tensor info; single-channel S32, at most 1 dimension.
+ *
+ * @return An empty Status on success, an error Status otherwise.
+ */
+Status CLEmbeddingLookupKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                         const ITensorInfo *lookups)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, lookups);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+  // The rank must lie in [2, 4]; the two bounds are alternatives, hence `||`.
+  ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() < 2 || input->num_dimensions() > 4);
+  ARM_COMPUTE_RETURN_ERROR_ON(lookups->num_dimensions() > 1);
+
+  return Status{};
+}
+
+/** Binds the tensors, builds the "embedding_lookup" OpenCL kernel and sets up its window.
+ *
+ * @param input   Input tensor.
+ * @param output  Output tensor; the kernel is built with DEPTH_OUT taken from its dimension 2.
+ * @param lookups Lookup indices tensor.
+ */
+void CLEmbeddingLookupKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                        const ICLTensor *lookups)
+{
+  // Reject null tensors before info() is called on them.
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups);
+  ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info()));
+
+  _input = input;
+  _output = output;
+  _lookups = lookups;
+
+  // Set kernel build options
+  std::stringstream kernel_name;
+  std::set<std::string> build_opts;
+  kernel_name << "embedding_lookup";
+
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+  build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions()));
+
+  // Create kernel
+  _kernel = static_cast<cl::Kernel>(
+      CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts));
+
+  // Configure kernel window
+  auto win_config = validate_and_configure_window(input->info(), output->info());
+  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+  ICLKernel::configure_internal(win_config.second);
+}
+
+/** Enqueues the kernel over @p window using 4D slices.
+ *
+ * The same slice is used for the input and output arguments; the lookups tensor is passed
+ * with a window whose X dimension is (0, 0, 0).
+ *
+ * @param window Execution window; must be a valid sub-window of the configured window.
+ * @param queue  Command queue the kernel is enqueued on.
+ */
+void CLEmbeddingLookupKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+  Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  Window win_lookup;
+  win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+  do
+  {
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, slice_in);
+    add_4D_tensor_argument(idx, _output, slice_in);
+    add_1D_tensor_argument(idx, _lookups, win_lookup);
+
+    enqueue(queue, *this, slice_in);
+  } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_1D(win_lookup));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp new file mode 100644 index 000000000..b1ee21bdc --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +#include "arm_compute/core/CL/kernels/CLExpKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +// Creates an unconfigured kernel: both tensor pointers start out null. +CLExpKernel::CLExpKernel() : _input(nullptr), _output(nullptr) {} + +// Binds the input/output tensors, builds the "exp_layer" OpenCL kernel and sets up its execution window. +// The output info is auto-initialized from the input's shape, data type and quantization info when it is empty. +// Only single-channel F32 input is accepted; the output must match the input's shape and data type. +void CLExpKernel::configure(const ICLTensor *input, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + // Auto initialize output + auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), + input->info()->quantization_info()); + + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + _input = input; + _output = output; + + // Elements handled per work-item along X; also passed to the kernel as VEC_SIZE. + constexpr unsigned int num_elems_processed_per_iteration = 4; + + // Create kernel + std::set<std::string> build_opts; + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.emplace( + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("exp_layer", build_opts)); + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, input->info()->valid_region()); + + ICLKernel::configure_internal(win); +} + +// Enqueues the kernel over `window` one 3D slice at a time, after collapsing dimensions from Z upward where possible. +// Input and output are passed as 3D tensor arguments using the same slice. +void CLExpKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window collapsed = 
window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice); + add_3D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } while (collapsed.slide_window_slice_3D(slice)); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp index 23efafa6a..ae2801e2b 100644 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp @@ -17,26 +17,14 @@ #include "arm_compute/core/CL/kernels/CLGatherKernel.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -#include <cmath> -#include <cstdlib> -#include <set> -#include <string> using namespace arm_compute; namespace { -constexpr unsigned int num_elems_processed_per_iteration = 16; +constexpr unsigned int num_elems_processed_per_iteration = 1; Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) @@ -46,6 +34,7 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S32, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); return Status{}; } @@ -57,8 +46,7 @@ CLGatherKernel::CLGatherKernel() : _input1(nullptr), _input2(nullptr), _output(n void CLGatherKernel::configure(const ICLTensor *input1, const 
ICLTensor *input2, ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info())); _input1 = input1; _input2 = input2; @@ -89,11 +77,10 @@ void CLGatherKernel::configure(const ICLTensor *input1, const ICLTensor *input2, static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); // Configure kernel window - const unsigned int num_elems_processed_per_iteration = 1; Window win = calculate_max_window(*input2->info(), Steps(num_elems_processed_per_iteration)); output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); - ICLKernel::configure(win); + ICLKernel::configure_internal(win); } Status CLGatherKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp new file mode 100644 index 000000000..cd7b21c6d --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+// Elements handled per work-item along X; also passed to the OpenCL kernel as VEC_SIZE.
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+/** Computes the execution window over the output and requests horizontal padding.
+ *
+ * @param input  Input tensor info.
+ * @param output Output tensor info; the window is derived from it.
+ *
+ * @return The window, paired with a RUNTIME_ERROR Status ("Insufficient Padding!") when
+ *         update_window_and_padding reports a change, or an empty Status otherwise.
+ */
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+  Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+  AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+  bool window_changed = update_window_and_padding(win, input_access, output_access);
+  input_access.set_valid_region(win, output->valid_region());
+
+  Status err = (window_changed)
+                   ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+                   : Status{};
+  return std::make_pair(err, win);
+}
+} // namespace
+
+/** Creates an unconfigured kernel.
+ *
+ * Only _input, _output and _lookups are initialized here; _keys, _hits and _lookup_indices
+ * are assigned in configure().
+ */
+CLHashtableLookupKernel::CLHashtableLookupKernel()
+    : _input(nullptr), _output(nullptr), _lookups(nullptr)
+{
+}
+
+/** Validates the tensor infos of a hashtable lookup.
+ *
+ * Failures are reported through the returned Status (the RETURN_ERROR macro family) so that
+ * callers of this static check can inspect the result.
+ *
+ * @param lookups Lookup values tensor info; single-channel S32, at most 1 dimension.
+ * @param keys    Keys tensor info; single-channel S32, at most 1 dimension.
+ * @param input   Values tensor info; single channel, 2 to 4 dimensions.
+ * @param output  Output tensor info; same data type as @p input, shape already set, and its
+ *                last dimension equal to the number of lookups.
+ * @param hits    Hit flags tensor info; single-channel U8/QASYMM8, at most 1 dimension, with
+ *                one element per lookup.
+ *
+ * @return An empty Status on success, an error Status otherwise.
+ */
+Status CLHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITensorInfo *keys,
+                                         const ITensorInfo *input, const ITensorInfo *output,
+                                         const ITensorInfo *hits)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lookups, keys, input, output, hits);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
+                                  "Output's shape was not set");
+
+  // Error when the sizes DISAGREE: hits and the output's last dimension must both hold one
+  // entry per lookup.
+  ARM_COMPUTE_RETURN_ERROR_ON(lookups->dimension(0) != hits->dimension(0) ||
+                              output->dimension(output->num_dimensions() - 1) !=
+                                  lookups->dimension(0));
+  // The rank must lie in [2, 4]; the two bounds are alternatives, hence `||`.
+  ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() < 2 || input->num_dimensions() > 4);
+  ARM_COMPUTE_RETURN_ERROR_ON(lookups->num_dimensions() > 1);
+  ARM_COMPUTE_RETURN_ERROR_ON(keys->num_dimensions() > 1);
+  ARM_COMPUTE_RETURN_ERROR_ON(hits->num_dimensions() > 1);
+
+  return Status{};
+}
+
+/** Binds the tensors, allocates the internal index tensor, builds the "hashtable_lookup"
+ * OpenCL kernel and sets up its window.
+ *
+ * @param lookups Lookup values tensor.
+ * @param keys    Keys tensor.
+ * @param input   Values tensor.
+ * @param output  Output tensor; the kernel is built with DEPTH_OUT taken from its dimension 2.
+ * @param hits    Hit flags tensor, written in run().
+ */
+void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTensor *keys,
+                                        const ICLTensor *input, ICLTensor *output, ICLTensor *hits)
+{
+  // Reject null tensors before info() is called on them.
+  ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits);
+  // Arguments are passed in the order validate() declares them.
+  ARM_COMPUTE_ERROR_THROW_ON(
+      validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info()));
+
+  _lookups = lookups;
+  _keys = keys;
+  _input = input;
+  _output = output;
+  _hits = hits;
+
+  // Make _lookup_indices tensor
+  _lookup_indices = arm_compute::support::cpp14::make_unique<CLTensor>();
+  _lookup_indices->allocator()->init(
+      TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32));
+  _lookup_indices->allocator()->allocate();
+
+  // Set kernel build options
+  std::stringstream kernel_name;
+  std::set<std::string> build_opts;
+  kernel_name << "hashtable_lookup";
+
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+  build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions()));
+
+  // Create kernel
+  _kernel = static_cast<cl::Kernel>(
+      CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts));
+
+  // Configure kernel window
+  auto win_config = validate_and_configure_window(input->info(), output->info());
+  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+  ICLKernel::configure_internal(win_config.second);
+}
+
+/** Resolves the lookups on the host, then enqueues the kernel.
+ *
+ * The lookups, keys, hits and index tensors are mapped; every lookup value is searched in a
+ * key -> key-index map. A match stores the key index in _lookup_indices and 1 in hits, a miss
+ * stores -1 and 0. The tensors are unmapped before the kernel is enqueued with the input,
+ * the output and the index tensor as arguments.
+ *
+ * @param window Execution window; must be a valid sub-window of the configured window.
+ * @param queue  Command queue used for mapping and for enqueuing the kernel.
+ */
+void CLHashtableLookupKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+  const_cast<ICLTensor *>(_lookups)->map(queue);
+  const_cast<ICLTensor *>(_keys)->map(queue);
+  _hits->map(queue);
+  _lookup_indices->map(queue);
+
+  // Set values of hits
+  const int32_t *lookups_buf =
+      reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_lookups)->buffer());
+  const int32_t *keys_buf = reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_keys)->buffer());
+  uint8_t *hits_buf = reinterpret_cast<uint8_t *>(_hits->buffer());
+  int32_t *lookup_indices_buf = reinterpret_cast<int32_t *>(_lookup_indices->buffer());
+
+  // If a key value occurs more than once, the last occurrence wins.
+  std::map<int32_t, size_t> key_map;
+  const size_t keys_num = _keys->info()->dimension(0);
+  for (size_t key_index = 0; key_index < keys_num; key_index++)
+  {
+    key_map[keys_buf[key_index]] = key_index;
+  }
+
+  const size_t lookups_num = _lookups->info()->dimension(0);
+  for (size_t i = 0; i < lookups_num; ++i)
+  {
+    const auto lookup_value = lookups_buf[i];
+    const auto it = key_map.find(lookup_value);
+    if (it != key_map.end())
+    {
+#if defined(DEBUG)
+      // it->second is an index into the keys, so it is bounded by the number of keys.
+      if (it->second >= keys_num)
+        ARM_COMPUTE_ERROR("HashTable Lookup: index out of bounds.");
+#endif // defined(DEBUG)
+      lookup_indices_buf[i] = static_cast<int32_t>(it->second);
+      hits_buf[i] = static_cast<uint8_t>(1);
+    }
+    else
+    {
+      lookup_indices_buf[i] = -1;
+      hits_buf[i] = static_cast<uint8_t>(0);
+    }
+  }
+
+  const_cast<ICLTensor *>(_lookups)->unmap(queue);
+  const_cast<ICLTensor *>(_keys)->unmap(queue);
+  _hits->unmap(queue);
+  _lookup_indices->unmap(queue);
+
+  Window win = window.collapse(ICLKernel::window(), 2, 4);
+
+  Window win_lookup;
+  win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+  do
+  {
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, win);
+    add_4D_tensor_argument(idx, _output, win);
+    add_1D_tensor_argument(idx, _lookup_indices.get(), win_lookup);
+
+    enqueue(queue, *this, win);
+  } while (window.slide_window_slice_4D(win) && window.slide_window_slice_1D(win_lookup));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp new file mode 100644 index 000000000..80d99dd3b --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +#include "arm_compute/core/CL/kernels/CLNegKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16, DataType::S32, + DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16, DataType::S32, + DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(input->info()->tensor_shape(), + output->info()->tensor_shape()); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + return Status{}; +} + +} // namespace + +CLNegKernel::CLNegKernel() : _input(nullptr), _output(nullptr) {} + +void CLNegKernel::configure(const ICLTensor *input, ICLTensor *output) +{ + + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info())); + + _input = input; + _output = output; + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Create kernel + std::set<std::string> build_opts; + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.emplace( + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("neg_tensor", build_opts)); + + // Configure window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, input->info()->valid_region()); + + 
ICLKernel::configure_internal(win); +} + +void CLNegKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice); + add_3D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice, lws_hint()); + } while (collapsed.slide_window_slice_3D(slice)); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp new file mode 100644 index 000000000..12bbe910f --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + NormalizationLayerInfo norm_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); + + // Checks performed when output is configured + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + } + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, + NormalizationLayerInfo norm_info) +{ + // Output tensor auto initialization if not yet initialized + auto_init_if_empty(*output, *input->clone()); + + const unsigned int norm_size = norm_info.norm_size(); + bool is_in_map = norm_info.is_in_map(); + + const unsigned int border_width = is_in_map ? std::min(norm_size / 2, 3U) : 0; + const BorderSize border_size = BorderSize(0, border_width); + + const unsigned int num_elems_processed_per_iteration = 4; + const unsigned int num_elems_read_per_iteration = + is_in_map ? 
(num_elems_processed_per_iteration + 2 * (norm_size / 2)) + : num_elems_processed_per_iteration; + + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); + + // We do not use a Rectangle window for IN_MAP_2D as we clamp the top and bottom accesses inside + // the kernel, avoiding padding + AccessWindowHorizontal input_access(input, -border_size.left, num_elems_read_per_iteration); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->valid_region()); + + Status err = (window_changed) + ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + return std::make_pair(err, win); +} +} // namespace + +CLNormalizationLayerExKernel::CLNormalizationLayerExKernel() + : _input(nullptr), _output(nullptr), _border_size(0), _is_in_map(false) +{ +} + +BorderSize CLNormalizationLayerExKernel::border_size() const { return _border_size; } + +void CLNormalizationLayerExKernel::configure(const ICLTensor *input, ICLTensor *output, + NormalizationLayerInfo norm_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + // Output tensor auto initialization if not yet initialized + auto_init_if_empty(*output->info(), *input->info()->clone()); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), norm_info)); + + _input = input; + _output = output; + + const unsigned int num_elems_processed_per_iteration = 4; + const bool is_in_map_2D = (norm_info.type() == NormType::IN_MAP_2D); + + // Set build options + CLBuildOptions build_opts; + build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.add_option( + ("-DCOEFF=" + float_to_string_with_full_precision(norm_info.scale_coeff()))); + build_opts.add_option(("-DBETA=" + 
float_to_string_with_full_precision(norm_info.beta()))); + build_opts.add_option(("-DKAPPA=" + float_to_string_with_full_precision(norm_info.kappa()))); + build_opts.add_option( + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + build_opts.add_option(("-DRADIUS=" + support::cpp11::to_string(norm_info.norm_size()))); + build_opts.add_option(("-DNUM_SLICES=" + support::cpp11::to_string(input->info()->dimension(2)))); + build_opts.add_option_if(is_in_map_2D, "-DIN_MAP_2D"); + + // Create kernel + std::string kernel_name = + _is_in_map ? "normalization_layer_in_map" : "normalization_layer_cross_map"; + _kernel = static_cast<cl::Kernel>( + CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options())); + + // Configure kernel window + auto win_config = validate_and_configure_window(input->info(), output->info(), norm_info); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); + + // Set config_id for enabling LWS tuning + _config_id = "normalization_layer_"; + _config_id += lower_string(string_from_data_type(input->info()->data_type())); + _config_id += "_"; + _config_id += support::cpp11::to_string( + static_cast<std::underlying_type<NormType>::type>(norm_info.type())); + _config_id += "_"; + _config_id += support::cpp11::to_string(norm_info.norm_size()); + _config_id += "_"; + _config_id += support::cpp11::to_string(input->info()->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(input->info()->dimension(1)); +} + +Status CLNormalizationLayerExKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + NormalizationLayerInfo norm_info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, norm_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), output->clone().get(), norm_info).first); + + return Status{}; +} + +void CLNormalizationLayerExKernel::run(const Window &window, 
cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const int collapsed_dimension = _is_in_map ? Window::DimZ : 4; + Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), collapsed_dimension); + Window slice = window_collapsed.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice); + add_3D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } while (window_collapsed.slide_window_slice_3D(slice)); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp new file mode 100644 index 000000000..241f8ae4d --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLPReLUKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +Status validate_info(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output) +{ + const TensorShape &out_shape = + TensorShape::broadcast_shape(input->tensor_shape(), alpha->tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, + DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(alpha, 1, DataType::F16, DataType::F32, + DataType::QASYMM8); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, + "Inputs are not broadcast compatible"); + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), + "Wrong shape for output"); + } + return Status{}; +} +} // namespace + +CLPReLUKernel::CLPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {} + +void CLPReLUKernel::configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, alpha); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), alpha->info(), output->info())); + + _input = input; + _alpha = alpha; + _output = output; + + // Create kernel + std::string kernel_name = "prelu"; + std::set<std::string> build_opts; + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.emplace( + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + + if 
(is_data_type_quantized_asymmetric(input->info()->data_type())) + { + build_opts.emplace("-DOFF_IN1=" + + support::cpp11::to_string(input->info()->quantization_info().offset)); + build_opts.emplace("-DOFF_IN2=" + + support::cpp11::to_string(alpha->info()->quantization_info().offset)); + build_opts.emplace("-DOFF_OUT=" + + support::cpp11::to_string(output->info()->quantization_info().offset)); + build_opts.emplace("-DSCALE_IN1=" + + support::cpp11::to_string(input->info()->quantization_info().scale)); + build_opts.emplace("-DSCALE_IN2=" + + support::cpp11::to_string(alpha->info()->quantization_info().scale)); + build_opts.emplace("-DSCALE_OUT=" + + support::cpp11::to_string(output->info()->quantization_info().scale)); + kernel_name += "_qasymm8"; + } + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + + const std::pair<TensorShape, ValidRegion> broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info()); + + const TensorShape &out_shape = broadcast_pair.first; + const ValidRegion &valid_region = broadcast_pair.second; + + // Auto initialize output if not initialized + { + set_shape_if_empty(*output->info(), out_shape); + + if (input->info()->data_type() == DataType::F16 && alpha->info()->data_type() == DataType::F16) + { + set_format_if_unknown(*output->info(), Format::F16); + } + else if (input->info()->data_type() == DataType::F32 || + alpha->info()->data_type() == DataType::F32) + { + set_format_if_unknown(*output->info(), Format::F32); + } + } + + Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); + Window win_input1 = win.broadcast_if_dimension_le_one(*input->info()); + Window win_input2 = win.broadcast_if_dimension_le_one(*alpha->info()); + + AccessWindowHorizontal input1_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input2_access(alpha->info(), 0, num_elems_processed_per_iteration); + 
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win_input1, input1_access) || + update_window_and_padding(win_input2, input2_access) || + update_window_and_padding(win, output_access); + + output_access.set_valid_region(win, valid_region); + + ICLKernel::configure_internal(win); +} + +void CLPReLUKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const TensorShape &in_shape1 = _input->info()->tensor_shape(); + const TensorShape &in_shape2 = _alpha->info()->tensor_shape(); + const TensorShape &out_shape = _output->info()->tensor_shape(); + + bool can_collapse = true; + if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) + { + can_collapse = + (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); + for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) + { + can_collapse = (in_shape1[d] == in_shape2[d]); + } + } + + bool has_collapsed = false; + Window collapsed = + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) + : window; + + const TensorShape &in_shape1_collapsed = + has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; + const TensorShape &in_shape2_collapsed = + has_collapsed ? 
in_shape2.collapsed_from(Window::DimZ) : in_shape2; + + Window slice = collapsed.first_slice_window_3D(); + Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); + Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice_input1); + add_3D_tensor_argument(idx, _alpha, slice_input2); + add_3D_tensor_argument(idx, _output, slice); + + enqueue(queue, *this, slice); + + collapsed.slide_window_slice_3D(slice_input1); + collapsed.slide_window_slice_3D(slice_input2); + } while (collapsed.slide_window_slice_3D(slice)); +} + +BorderSize CLPReLUKernel::border_size() const +{ + const unsigned int replicateSize = + _output->info()->dimension(0) - + std::min(_input->info()->dimension(0), _alpha->info()->dimension(0)); + const unsigned int border = + std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); + return BorderSize(0, border, 0, 0); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp new file mode 100644 index 000000000..99b54c822 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLPadLayerKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input_info, const ITensorInfo *output_info, + const ITensorInfo *pad_size_info) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_info, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_info, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(pad_size_info, 1, DataType::S32); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_info->num_dimensions() > 0 && + input_info->num_dimensions() <= 4, + "Pad kernel supports upto 4-D input tensor"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + input_info->num_dimensions() == output_info->num_dimensions(), + "output tensor should have same number of dimensions as input tensor"); + + if (input_info->data_type() == DataType::QASYMM8) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_info->quantization_info() != + output_info->quantization_info(), + "The input and output quantization info are different!"); + } + + return Status{}; +} + +} // namespace + +CLPadLayerKernel::CLPadLayerKernel() : _input(nullptr), _output(nullptr), _pad_size(nullptr) {} + +void CLPadLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *pad_size) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, pad_size); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pad_size->info())); + + _input = input; + _output = output; + _pad_size = pad_size; + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + 
build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); + build_opts.emplace("-DIB=" + support::cpp11::to_string(input->info()->dimension(3))); + build_opts.emplace("-DIW=" + support::cpp11::to_string(input->info()->dimension(0))); + build_opts.emplace("-DIH=" + support::cpp11::to_string(input->info()->dimension(1))); + build_opts.emplace("-DID=" + support::cpp11::to_string(input->info()->dimension(2))); + if (input->info()->data_type() == DataType::QASYMM8) + { + build_opts.emplace("-DZERO_VALUE=" + + support::cpp11::to_string(input->info()->quantization_info().offset)); + } + else + { + build_opts.emplace("-DZERO_VALUE=" + support::cpp11::to_string(0)); + } + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("pad", build_opts)); + + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps()); + + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + + ICLKernel::configure_internal(win); +} + +void CLPadLayerKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + + _pad_size->map(queue); + + // Padding values only for up, top, left and front are required based on the rank of tensor + int rank = _pad_size->info()->dimension(1); + + auto pad_batch_up = + (rank == 4) ? *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, 0})) : 0; + auto pad_height_top = + (rank >= 2) + ? *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, (rank == 2) ? 0 : 1})) + : 0; + auto pad_width_left = (rank >= 1) + ? *reinterpret_cast<const int32_t *>( + _pad_size->ptr_to_element({0, (rank == 4) ? 2 : rank - 1})) + : 0; + auto pad_depth_front = + (rank >= 3) + ? 
*reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, (rank == 3) ? 0 : 3})) + : 0; + + _pad_size->unmap(queue); + + // Pad_values which needs to be passed + const cl_int4 paddingValues = { + {static_cast<cl_int>(pad_width_left), static_cast<cl_int>(pad_height_top), + static_cast<cl_int>(pad_depth_front), static_cast<cl_int>(pad_batch_up)}}; + + Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + // Setup output slice + Window slice_in(slice_out); + slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_in.set(3, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + _kernel.setArg<cl_int4>(idx++, paddingValues); + enqueue(queue, *this, slice_out); + } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in)); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp new file mode 100644 index 000000000..aa094761c --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLPermuteExKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +using namespace arm_compute; + +namespace +{ +TensorShape get_output_shape(const ITensorInfo *input, const PermutationVector &perm) +{ + TensorShape output_shape = input->tensor_shape(); + permute(output_shape, perm); + return output_shape; +} + +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const PermutationVector &perm) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + + const TensorShape output_shape = + misc::shape_calculator::compute_permutation_output_shape(*input, perm); + + // Validate configured output + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + } + return Status{}; +} +} // namespace + +CLPermuteExKernel::CLPermuteExKernel() : _input(nullptr), _output(nullptr), _perm() {} + +void CLPermuteExKernel::configure(const ICLTensor *input, ICLTensor *output, + const PermutationVector &perm) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), perm)); + + _input = input; + _output = output; + _perm = perm; + + const TensorShape output_shape = get_output_shape(input->info(), perm); + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); + + // Create kernel + std::set<std::string> 
build_opts; + + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2))); + + // New positions of batch(D), height(H), width(w) and channel(C) based on permutation vector + build_opts.emplace("-DP1=" + support::cpp11::to_string(perm[0])); + build_opts.emplace("-DP2=" + support::cpp11::to_string(perm[1])); + build_opts.emplace("-DP3=" + support::cpp11::to_string(perm[2])); + build_opts.emplace("-DP4=" + support::cpp11::to_string(perm[3])); + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("permute_generic", build_opts)); + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps()); + + // The CLPermute doesn't need padding so update_window_and_padding() can be skipped + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + + ICLKernel::configure_internal(win); +} + +Status CLPermuteExKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + const PermutationVector &perm) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, perm)); + + return Status{}; +} + +void CLPermuteExKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + + Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + // Setup output slice + Window slice_out(slice_in); + slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_out.set(3, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, 
_input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_in); + } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp index a3e0163de..b985aa737 100644 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp @@ -17,20 +17,8 @@ #include "arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -#include <cmath> -#include <cstdlib> -#include <set> -#include <string> using namespace arm_compute; @@ -45,12 +33,10 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, ARM_COMPUTE_UNUSED(overflow_policy); ARM_COMPUTE_UNUSED(rounding_policy); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, - DataType::QS16, DataType::S16, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, - DataType::QS16, DataType::S16, DataType::F16, - DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative."); const TensorShape 
&out_shape = @@ -58,21 +44,11 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2); - - if (is_data_type_fixed_point(input1->data_type())) - { - // All data types must be all QS8 or all QS16 - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale != 1, - "Unsupported scaling factor for QS8/QS16. Scale must be 1."); - } // Validate in case of configured output if (output->total_size() > 0) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, - DataType::QS16, DataType::S16, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MSG( output->data_type() == DataType::U8 && @@ -81,11 +57,6 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, ARM_COMPUTE_RETURN_ERROR_ON_MSG( detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output"); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, output); - if (is_data_type_fixed_point(input1->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); - } } return Status{}; @@ -191,14 +162,6 @@ void CLPixelWiseDivisionKernel::configure(const ICLTensor *input1, const ICLTens { compute_type = "int"; } - else if (input1->info()->data_type() == DataType::QS8) - { - compute_type = "qs8"; - } - else if (input1->info()->data_type() == DataType::QS16) - { - compute_type = "qs16"; - } else { compute_type = "ushort"; @@ -218,11 +181,6 @@ void CLPixelWiseDivisionKernel::configure(const ICLTensor *input1, const ICLTens : "-DSATURATE"); build_opts.emplace((rounding_policy == RoundingPolicy::TO_ZERO) ? 
"-DROUND=_rtz" : "-DROUND=_rte"); - if (is_data_type_fixed_point(input1->info()->data_type())) - { - build_opts.emplace("-DFIXED_POINT_POSITION=" + - support::cpp11::to_string(input1->info()->fixed_point_position())); - } build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type())); build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type())); build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); @@ -245,7 +203,7 @@ void CLPixelWiseDivisionKernel::configure(const ICLTensor *input1, const ICLTens _kernel.setArg(idx++, scale); } - ICLKernel::configure(win_config.second); + ICLKernel::configure_internal(win_config.second); } Status CLPixelWiseDivisionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp deleted file mode 100644 index 168b246bf..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/core/CL/kernels/CLReduceMaxKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -#include <cmath> -#include <cstdlib> -#include <set> -#include <string> - -using namespace arm_compute; - -namespace -{ -constexpr unsigned int num_elems_processed_per_iteration = 16; - -Status validate_arguments(const ITensorInfo *input, int32_t axis, const ITensorInfo *output) -{ - // We can handle for simple case only - // Input rank: 2 - // Output rank: 1 - // Axis: one axis value, restrict to 1 - - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis != 1, "Axis only allowed 1"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0, - "Inputs are not broadcast compatible"); - - // Validate in case of configured output - if (output->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != input->data_type(), - "Output same type allowed for input and output"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().num_dimensions() != 1, - "Only support for output dimension 1"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->tensor_shape().num_dimensions() != 2, - "Only support for input dimension 2"); - } - - return Status{}; -} - -} // namespace - -CLReduceMaxKernel::CLReduceMaxKernel() : _input(nullptr), _output(nullptr), _axis(0) {} - -void CLReduceMaxKernel::configure(const ICLTensor *input, int32_t axis, ICLTensor *output) -{ - 
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, output->info())); - - _input = input; - _output = output; - _axis = axis; - - // Configure kernel window - int cols = _input->info()->tensor_shape()[0]; - int rows = _input->info()->tensor_shape()[1]; - Window win; - win.set(0, Window::Dimension(0, cols, 1)); - win.set(1, Window::Dimension(0, rows, 1)); - - // Construct kernel name - std::string kernel_name = "reduce_max"; - - // Set kernel build options - std::set<std::string> build_opts; - build_opts.emplace("-DWIDTH=" + support::cpp11::to_string(cols)); - - // Create kernel - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); - - ICLKernel::configure(win); -} - -Status CLReduceMaxKernel::validate(const ITensorInfo *input, int32_t axis, - const ITensorInfo *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, output)); - - return Status{}; -} - -void CLReduceMaxKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window window_input = window; - Window slice_input = window_input.first_slice_window_1D(); - - do - { - Window slice_output = slice_input.shift_dimensions(1); - unsigned int idx = 0; - add_1D_tensor_argument(idx, _input, slice_input); - add_1D_tensor_argument(idx, _output, slice_output); - enqueue(queue, *this, slice_input); - - } while (window_input.slide_window_slice_1D(slice_input)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp new file mode 100644 index 000000000..f581780e1 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved + * Copyright (c) 2017-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; +namespace +{ +// NOTE This is necessary because it is not guaranteed that the axis positions of input and output +// are the same. 
+const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis) +{ + TensorShape out_shape{input_shape}; + + out_shape.set(axis, 1); + + return out_shape; +} +} // namespace + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis, + ReduceOperation op) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + } + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, + DataType::F32, DataType::S32); + if (op == ReduceOperation::MEAN || op == ReduceOperation::SUM) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8, + "Not support QASYMM8, yet"); + } + // Report a type mismatch through the returned Status, like every other check in this function + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0, + "Inputs are not broadcast compatible"); + + const auto num_dimensions = input->tensor_shape().num_dimensions(); + // The macro fails when its condition is true, so test for the invalid case. + // axis is unsigned, hence only the upper bound can be violated. + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + axis >= num_dimensions, + "axis must be greater than or equal to 0 and less than (input's rank)."); + + const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(), + "output shape's size does not match axis"); + + return Status{}; +} +} // namespace + +CLReduceOperationKernel::CLReduceOperationKernel() : _input(nullptr), _output(nullptr), _axis() {} + +void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *output, + const uint32_t axis, ReduceOperation op) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op)); + + _input = input; + _output = output; + _axis = axis; + + std::unique_ptr<ITensorInfo> output_info = output->info()->clone(); + 
output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis)); + + // Construct kernel name + std::string kernel_name; + int op_code = 0; + if (op == ReduceOperation::MAX) + { + kernel_name = "reduce_min_max"; + op_code = 1; + } + else if (op == ReduceOperation::MIN) + { + kernel_name = "reduce_min_max"; + op_code = 2; + } + else if (op == ReduceOperation::SUM) + { + kernel_name = "reduce_sum_mean"; + op_code = 3; + } + else if (op == ReduceOperation::MEAN) + { + kernel_name = "reduce_sum_mean"; + op_code = 4; + } + else + throw std::runtime_error("Operation not supported, yet"); + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(output_info->data_type())); + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2))); + build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code)); + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + + // Configure kernel window + Window win = calculate_max_window(*output_info, Steps()); + + Coordinates coord; + coord.set_num_dimensions(output_info->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape())); + + ICLKernel::configure_internal(win); +} + +Status CLReduceOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + const uint32_t axis, ReduceOperation op) +{ + // validate() reports problems through the returned Status, so null arguments must not assert + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op)); + + return Status{}; +} + +void CLReduceOperationKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const TensorShape &shape_in = _input->info()->tensor_shape(); + + unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the 
input and output parameters + + _kernel.setArg<cl_int>(idx++, _axis); + _kernel.setArg<cl_int>(idx++, shape_in[_axis]); + + // Support dimensions up to 4 + Window slice_out = window.collapse(ICLKernel::window(), 2, 4); + + // Setup input slice + Window slice_in(slice_out); + slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_in.set(3, Window::Dimension(0, 0, 0)); + + // Copy output's shape in order to use for recovering at end of this method + // TODO Remove changing and recovering output's shape if it is guaranteed that the axis positions + // of input and output are the same + const TensorShape shape_out = _output->info()->tensor_shape(); + _output->info()->set_tensor_shape(inferOutputShape(shape_in, _axis)); + + idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_out); + + // Recover output's shape of output tensor + _output->info()->set_tensor_shape(shape_out); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReductionMeanKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReductionMeanKernel.cpp deleted file mode 100644 index 84a77122d..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLReductionMeanKernel.cpp +++ /dev/null @@ -1,198 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLReductionMeanKernel.h" - -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/FixedPoint.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -#include "support/ToolchainSupport.h" - -using namespace arm_compute; - -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - std::vector<uint32_t> axis) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis.size() >= TensorShape::num_max_dimensions, - "Reduction axis greater than max number of dimensions"); - - std::vector<uint32_t>::const_iterator it; - bool axis_w = false; - bool axis_h = false; - for (it = axis.begin(); it != axis.end(); ++it) - { - if ((*it) == 0) - { - axis_w = true; - } - else if ((*it) == 1) - { - axis_h = true; - } - else - { - ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported axis!"); - } - } - // TODO Other axises (currently, only axises for both width and height are supported.) 
- if (!axis_w || !axis_h) - { - ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported axis!"); - } - - if (output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != DataLayout::NCHW); - } - - return Status{}; -} - -std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, - std::vector<uint32_t> axis) -{ - // Output tensor auto initialization if not yet initialized - TensorShape output_shape{input->tensor_shape()}; - output_shape.set(0, 1); - output_shape.set(1, 1); - auto_init_if_empty(*output, output_shape, output->num_channels(), input->data_type(), - input->fixed_point_position()); - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration_x = 8; // step - const unsigned int num_elems_processed_per_iteration_y = input->dimension(1); - - Window win = calculate_max_window( - *input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, - num_elems_processed_per_iteration_y); - AccessWindowHorizontal output_access(output, 0, 1); - bool window_changed = update_window_and_padding(win, input_access, output_access); - output_access.set_valid_region(win, output->valid_region()); - - Status err = (window_changed) - ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; - - return std::make_tuple(err, win); -} -} // namespace - -CLReductionMeanKernel::CLReductionMeanKernel() - : _input(nullptr), _output(nullptr), _reduction_axis(), _border_size() -{ -} - -BorderSize CLReductionMeanKernel::border_size() const { return _border_size; } - -void CLReductionMeanKernel::configure(const ICLTensor *input, ICLTensor *output, - std::vector<uint32_t> axis) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis)); - - _input = input; - _output = output; - _reduction_axis = axis; - - constexpr unsigned int num_elems_processed_per_iteration_x = 8; // step - - // Set border size - _border_size = BorderSize( - ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration_x) - - input->info()->dimension(0)); - - // Set build options - std::set<std::string> build_opts; - build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); - // build_opts.emplace(("-DVEC_SIZE=" + - // support::cpp11::to_string(num_elems_processed_per_iteration))); - if (is_data_type_fixed_point(input->info()->data_type())) - { - build_opts.emplace("-DFIXED_POINT_POSITION=" + - support::cpp11::to_string(input->info()->fixed_point_position())); - } - - // Create kernel - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("reduction_mean", build_opts)); - - // Configure kernel window - auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis); - - ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); - - ICLKernel::configure(std::get<1>(win_config)); -} - -Status CLReductionMeanKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - std::vector<uint32_t> axis) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis)); - ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>( - 
validate_and_configure_window(input->clone().get(), output->clone().get(), axis))); - - return Status{}; -} - -void CLReductionMeanKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - // Set out window - Window out_window(window); - out_window.set(Window::DimX, Window::Dimension(0, 0, 0)); - - // Get first input and output slices - Window in_slice = window.first_slice_window_2D(); - Window out_slice = out_window.first_slice_window_2D(); - - // Set local sums buffer - // TODO work_group - unsigned int local_sum_size = _lws_hint[0] * _input->info()->element_size(); - - unsigned int idx = 2 * num_arguments_per_2D_tensor(); - _kernel.setArg(idx++, local_sum_size, nullptr); - _kernel.setArg<cl_int>(idx++, static_cast<cl_int>(_input->info()->dimension(1))); // height - _kernel.setArg<cl_int>(idx++, static_cast<cl_int>(_input->info()->dimension(0) * - _input->info()->dimension(1))); // divider - - do - { - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input, in_slice); - in_slice.set_dimension_step(Window::DimY, _input->info()->dimension(1)); - add_2D_tensor_argument(idx, _output, out_slice); - enqueue(queue, *this, in_slice); - } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp new file mode 100644 index 000000000..6b0697e89 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_size, + const ITensorInfo *padding_size, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::F16, DataType::S32, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_size, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(padding_size, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::F16, DataType::S32, + DataType::F32); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != output->num_dimensions(), + "The number of dimensions of input should be equal to output"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() != output->data_layout(), + "The input and output layouts are different!"); + + // TODO Support other cases + if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NCHW) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != output->dimension(2), + "Input Depth should be equal to Output Depth"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 
|| + padding_size->dimension(1) != 2, + "Only 2-dimensional spatial block's size was wrong"); + } + else if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NHWC) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(0) != output->dimension(0), + "Input Depth should be equal to Output Depth"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 || + padding_size->dimension(1) != 2, + "Only 2-dimensional spatial block's size was wrong"); + } + else + { + ARM_COMPUTE_RETURN_ERROR_MSG("CLSpaceToBatchNDKernel supports only 4-dimensional input"); + } + + // A rank is out of range when it is below 2 OR above 4; it can never be both at once + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() < 2 || input->num_dimensions() > 4, + "CLSpaceToBatchNDKernel supports dimensions up to 4"); + + if (input->data_type() == DataType::QASYMM8) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->quantization_info() != output->quantization_info(), + "The input and output quantization info are different!"); + } + + return Status{}; +} + +} // namespace + +CLSpaceToBatchNDKernel::CLSpaceToBatchNDKernel() : _input(nullptr), _output(nullptr) {} + +void CLSpaceToBatchNDKernel::configure(const ICLTensor *input, const ICLTensor *block_size, + const ICLTensor *padding_size, ICLTensor *output) +{ + + // block_size and padding_size are dereferenced right below, so check them as well + ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_size, padding_size, output); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), block_size->info(), padding_size->info(), output->info())); + + _input = input; + _block_size = block_size; + _padding_size = padding_size; + _output = output; + + // Set kernel build options + // TODO Support other cases + std::string kernel_name = "space_to_batch_4d"; + std::set<std::string> build_opts; + Window win; + + if (input->info()->data_layout() == DataLayout::NCHW) + { + kernel_name += "_nchw"; + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); + build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(1))); + build_opts.emplace("-DWIDTH_IN=" + 
support::cpp11::to_string(input->info()->dimension(0))); + + win = calculate_max_window(*output->info(), Steps()); + + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + } + else if (input->info()->data_layout() == DataLayout::NHWC) + { + kernel_name += "_nhwc"; + build_opts.emplace("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); + build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(2))); + build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(1))); + build_opts.emplace("-DVEC_SIZE=" + + support::cpp11::to_string(num_elems_processed_per_iteration)); + + win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, input_access, output_access); + input_access.set_valid_region(win, output->info()->valid_region()); + + if (window_changed) + { + ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!"); + } + } + else + { + ARM_COMPUTE_ERROR("Unsupported layout"); + } + + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(3))); + if (input->info()->data_type() == DataType::QASYMM8) + { + build_opts.emplace("-DZERO_VALUE=" + + support::cpp11::to_string(input->info()->quantization_info().offset)); + } + else + { + build_opts.emplace("-DZERO_VALUE=" + support::cpp11::to_string(0)); + } + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + + // Configure kernel window + 
ICLKernel::configure_internal(win); +} + +void CLSpaceToBatchNDKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + +#if defined(DEBUG) + const_cast<ICLTensor *>(_block_size)->map(queue); + const_cast<ICLTensor *>(_padding_size)->map(queue); + + const size_t num_dimensions = _input->info()->num_dimensions(); + const size_t num_spacial_dimensions = _block_size->info()->dimension(0); + int32_t batch_size = _input->info()->dimension(num_dimensions - 1); + for (size_t i = 0; i < num_spacial_dimensions; ++i) + { + const int32_t block_size = *reinterpret_cast<int32_t *>(_block_size->ptr_to_element({i})); + const int32_t padding_size_pre = + *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({0, i})); + const int32_t padding_size_post = + *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({1, i})); + + ARM_COMPUTE_ERROR_ON_MSG(block_size < 1, "Block size should be greater than or equal to 1"); + // A single negative padding is already invalid, so the two tests are OR-ed + ARM_COMPUTE_ERROR_ON_MSG(padding_size_pre < 0 || padding_size_post < 0, + "Padding size should be greater than or equal to 0"); + + if (num_dimensions == 4 && _input->info()->data_layout() == DataLayout::NCHW) + { + ARM_COMPUTE_ERROR_ON_MSG( + _output->info()->dimension(i) != + (_input->info()->dimension(i) + padding_size_pre + padding_size_post) / block_size, + "Dimension value of spatial block does not match output's dimension value"); + } + else + { + ARM_COMPUTE_ERROR_ON_MSG( + _output->info()->dimension(num_dimensions - num_spacial_dimensions - 1 + i) != + (_input->info()->dimension(num_dimensions - num_spacial_dimensions - 1 + i) + + padding_size_pre + padding_size_post) / + block_size, + "Dimension value of spatial block does not match output's dimension value"); + } + + batch_size *= block_size; + } + ARM_COMPUTE_ERROR_ON_MSG( + _output->info()->dimension(num_dimensions - 1) != batch_size, + "Output batch size should be equal 
to input batch size * (multiplication of all block size)"); + + const_cast<ICLTensor *>(_block_size)->unmap(queue); + const_cast<ICLTensor *>(_padding_size)->unmap(queue); +#endif // defined(DEBUG) + + Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + // Setup output slice + Window slice_in(slice_out); + slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_in.set(3, Window::Dimension(0, 0, 0)); + + // Set block size window + Window win_block = calculate_max_window(*_block_size->info(), Steps()); + + // Set padding size window + Window win_padding = calculate_max_window(*_padding_size->info(), Steps()); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + add_1D_tensor_argument(idx, _block_size, win_block); + add_2D_tensor_argument(idx, _padding_size, win_padding); + enqueue(queue, *this, slice_out); + } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in)); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp new file mode 100644 index 000000000..5d6329edc --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const int32_t block_size) +{ + // Every macro below returns an error Status when its condition is TRUE, + // so each condition states the failure case that the message describes. + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + // Checked first: the divisions and modulo operations below rely on block_size >= 1 + ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1, + "Block size should be greater than or equal to 1."); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(3) != output->dimension(3), + "Input batch should be equal to Output batch"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + input->dimension(2) * block_size * block_size != output->dimension(2), + "Output depth should be equal to (input depth * block size *block size)"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->dimension(0) % block_size) || + (input->dimension(1) % block_size), + "Input height and width should be divisible by block size"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != (input->dimension(0) / block_size)) || + (output->dimension(1) != (input->dimension(1) / block_size)), + "Output height and width should be equal to " + "input_height/blocksize and input_width/blocksize respectively"); + + return 
Status{}; +} + +} // namespace + +CLSpaceToDepthKernel::CLSpaceToDepthKernel() : _input(nullptr), _output(nullptr) {} + +void CLSpaceToDepthKernel::configure(const ICLTensor *input, ICLTensor *output, + const int32_t block_size) +{ + + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size)); + + _input = input; + _output = output; + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size)); + build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2))); + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("space_to_depth", build_opts)); + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps()); + + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + + ICLKernel::configure_internal(win); +} + +void CLSpaceToDepthKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + + Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + // Setup output slice + Window slice_out(slice_in); + slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_out.set(3, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_in); + } while (window.slide_window_slice_4D(slice_in) && 
window.slide_window_slice_4D(slice_out)); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp new file mode 100644 index 000000000..260bc39f1 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#include "arm_compute/core/CL/kernels/CLSquaredDifferenceKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+// Elements handled per iteration along X; also passed to the OpenCL kernel as VEC_SIZE.
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+/**
+ * Check data types and broadcast compatibility of the two inputs and, if it is already
+ * configured, the output.
+ *
+ * @param[in] input1 First input tensor info (F16 or F32).
+ * @param[in] input2 Second input tensor info (F16 or F32).
+ * @param[in] output Output tensor info; only checked when its total size is non-zero.
+ *
+ * @return An empty Status when all checks pass, otherwise the first failing check.
+ */
+Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+  const TensorShape &out_shape =
+      TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::F16, DataType::F32);
+
+  // An empty broadcast shape is treated as "shapes cannot be broadcast together"
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
+                                  "Inputs are not broadcast compatible");
+  // Validate in case of configured output
+  if (output->total_size() > 0)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
+        "Wrong shape for output");
+  }
+  return Status{};
+}
+} // namespace
+
+/** Construct an unconfigured kernel; configure() must be called before run(). */
+CLSquaredDifferenceKernel::CLSquaredDifferenceKernel()
+    : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+/**
+ * Validate the tensors, build the "squared_difference" OpenCL kernel, auto-initialise the output
+ * shape/format where unset, and set up the execution window and padding.
+ *
+ * @param[in]  input1 First input tensor.
+ * @param[in]  input2 Second input tensor; must have the same data type as @p input1.
+ * @param[out] output Output tensor; must have the same data type as @p input1.
+ */
+void CLSquaredDifferenceKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
+                                          ICLTensor *output)
+{
+  // Fail with a clear error before any of the tensors is dereferenced below
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate(input1->info(), input2->info(), output->info()));
+
+  _input1 = input1;
+  _input2 = input2;
+  _output = output;
+
+  // Create kernel
+  std::set<std::string> build_opts;
+  build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())));
+  build_opts.emplace(
+      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+  _kernel = static_cast<cl::Kernel>(
+      CLKernelLibraryEx::get().create_kernel("squared_difference", build_opts));
+
+  const std::pair<TensorShape, ValidRegion> broadcast_pair =
+      ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
+
+  const TensorShape &out_shape = broadcast_pair.first;
+  const ValidRegion &valid_region = broadcast_pair.second;
+
+  // Auto initialize output if not initialized
+  {
+    set_shape_if_empty(*output->info(), out_shape);
+
+    if (input1->info()->data_type() == DataType::F16 &&
+        input2->info()->data_type() == DataType::F16)
+    {
+      set_format_if_unknown(*output->info(), Format::F16);
+    }
+    else if (input1->info()->data_type() == DataType::F32 ||
+             input2->info()->data_type() == DataType::F32)
+    {
+      set_format_if_unknown(*output->info(), Format::F32);
+    }
+  }
+
+  Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+  Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info());
+  Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info());
+
+  AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+  const bool window_changed = update_window_and_padding(win_input1, input1_access) ||
+                              update_window_and_padding(win_input2, input2_access) ||
+                              update_window_and_padding(win, output_access);
+  // The result is intentionally not acted upon here; mark it as unused so the build stays
+  // warning-free.
+  // NOTE(review): upstream ACL kernels report "Insufficient Padding!" in this case - confirm
+  // whether this kernel should do the same.
+  ARM_COMPUTE_UNUSED(window_changed);
+
+  output_access.set_valid_region(win, valid_region);
+
+  ICLKernel::configure_internal(win);
+}
+
+/**
+ * Enqueue the kernel once per 3D slice, collapsing the window from Z upwards when allowed.
+ *
+ * @param[in]     window Region to execute; must be a valid sub-window of the configured window.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLSquaredDifferenceKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  const TensorShape &in_shape1 = _input1->info()->tensor_shape();
+  const TensorShape &in_shape2 = _input2->info()->tensor_shape();
+  const TensorShape &out_shape = _output->info()->tensor_shape();
+
+  // Collapsing is always attempted when one input has at most one element. Otherwise both inputs
+  // must have more than DimZ dimensions and agree on every dimension from Z upwards.
+  bool can_collapse = true;
+  if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+  {
+    can_collapse =
+        (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+    for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+    {
+      can_collapse = (in_shape1[d] == in_shape2[d]);
+    }
+  }
+
+  bool has_collapsed = false;
+  Window collapsed =
+      can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
+                   : window;
+
+  // Input shapes are collapsed the same way as the window so the broadcast test below matches
+  const TensorShape &in_shape1_collapsed =
+      has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+  const TensorShape &in_shape2_collapsed =
+      has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+  Window slice = collapsed.first_slice_window_3D();
+  Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+  Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+  do
+  {
+    unsigned int idx = 0;
+    add_3D_tensor_argument(idx, _input1, slice_input1);
+    add_3D_tensor_argument(idx, _input2, slice_input2);
+    add_3D_tensor_argument(idx, _output, slice);
+
+    enqueue(queue, *this, slice);
+
+    // Advance the input slices alongside the output slice, which drives the loop
+    collapsed.slide_window_slice_3D(slice_input1);
+    collapsed.slide_window_slice_3D(slice_input2);
+  } while (collapsed.slide_window_slice_3D(slice));
+}
+
+/**
+ * Border required by this kernel.
+ *
+ * The only non-zero entry is the second BorderSize argument. Its value is the difference between
+ * the output width and the narrower input width, capped at one less than the vector size.
+ * NOTE(review): the second argument is presumably the right border - confirm against BorderSize.
+ *
+ * @return The border size described above.
+ */
+BorderSize CLSquaredDifferenceKernel::border_size() const
+{
+  const unsigned int replicateSize =
+      _output->info()->dimension(0) -
+      std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+  const unsigned int border =
+      std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
+  return BorderSize(0, border, 0, 0);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp
index 80ffd423a..48146a43a 100644
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceKernel.cpp
+++
b/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp @@ -14,43 +14,30 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h" +#include "arm_compute/core/CL/kernels/CLStridedSliceExKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include <string> - -using namespace std; using namespace arm_compute; -static const int32_t maxDim = 4; - -CLStridedSliceKernel::CLStridedSliceKernel() +CLStridedSliceExKernel::CLStridedSliceExKernel() : _input(nullptr), _output(nullptr), _beginData(nullptr), _endData(nullptr), _stridesData(nullptr), _beginMask(0), _endMask(0), _shrinkAxisMask(0) { } -Status CLStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *begin, const ITensorInfo *end, - const ITensorInfo *strides, int32_t beginMask, - int32_t endMask, int32_t shrinkAxisMask) +Status CLStridedSliceExKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *begin, const ITensorInfo *end, + const ITensorInfo *strides, int32_t beginMask, + int32_t endMask, int32_t shrinkAxisMask) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, begin, end, strides); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8, DataType::U16, - DataType::S16, DataType::QS16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); + input, 1, DataType::U8, 
DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(begin, 1, DataType::S32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(end, 1, DataType::S32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(strides, 1, DataType::S32); @@ -153,15 +140,6 @@ inline int32_t StopForAxis(int32_t endMask, int32_t end, int32_t stride, return stop; } -inline int32_t offset4D(const TensorShape &shape, int32_t b, int32_t d, int32_t h, int32_t w) -{ - int32_t offset = b * shape[2] * shape[1] * shape[0]; - offset += d * shape[1] * shape[0]; - offset += h * shape[0]; - offset += w; - return offset; -} - inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride) { int32_t ret = 0; @@ -177,10 +155,10 @@ inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride) return ret; } -void CLStridedSliceKernel::configure(const ICLTensor *input, ICLTensor *output, - ICLTensor *beginData, ICLTensor *endData, - ICLTensor *stridesData, int32_t beginMask, int32_t endMask, - int32_t shrinkAxisMask) +void CLStridedSliceExKernel::configure(const ICLTensor *input, ICLTensor *output, + ICLTensor *beginData, ICLTensor *endData, + ICLTensor *stridesData, int32_t beginMask, int32_t endMask, + int32_t shrinkAxisMask) { ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), beginData->info(), endData->info(), stridesData->info(), beginMask, endMask, @@ -195,48 +173,31 @@ void CLStridedSliceKernel::configure(const ICLTensor *input, ICLTensor *output, _endMask = endMask; _shrinkAxisMask = shrinkAxisMask; - constexpr unsigned int num_elems_processed_per_iteration = 1; - // Set kernel build options std::set<std::string> build_opts; build_opts.emplace("-DELEMENT_DATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.emplace("-DELEMENT_SIZE=" + support::cpp11::to_string(input->info()->element_size())); + build_opts.emplace("-DDEPTH_OUT=" + 
support::cpp11::to_string(output->info()->dimension(2))); // Create kernel - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("strided_slice", build_opts)); + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("strided_slice_ex", build_opts)); - // Create output's window without padding - TensorShape collapsed = output->info()->tensor_shape(); - collapsed.collapse(4); - TensorInfo info = *output->info(); - info.set_tensor_shape(collapsed); - Window win = calculate_max_window(info, Steps(num_elems_processed_per_iteration)); - - ICLKernel::configure(win); + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps()); + ICLKernel::configure_internal(win); } -void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue) +void CLStridedSliceExKernel::run(const Window &window, cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - // Create input window - TensorShape collapsed = _input->info()->tensor_shape(); - collapsed.collapse(4); - TensorInfo info = *_input->info(); - info.set_tensor_shape(collapsed); - Window win_in = calculate_max_window(info, Steps(_input->info()->tensor_shape().total_size())); - _beginData->map(queue); _endData->map(queue); _stridesData->map(queue); - std::vector<int32_t> dimsIn; - std::vector<int32_t> dimsOut; std::vector<int32_t> starts; - std::vector<int32_t> stops; std::vector<int32_t> strides; for (uint32_t n = 0; n < _beginData->info()->tensor_shape().total_size(); ++n) @@ -246,22 +207,13 @@ void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue) StartForAxis(_beginMask, reinterpret_cast<int32_t *>(_beginData->buffer())[n], reinterpret_cast<int32_t *>(_stridesData->buffer())[n], shape, n)); - stops.emplace_back(StopForAxis(_endMask, reinterpret_cast<int32_t *>(_endData->buffer())[n], - reinterpret_cast<int32_t 
*>(_stridesData->buffer())[n], shape, - n)); - strides.emplace_back(reinterpret_cast<int32_t *>(_stridesData->buffer())[n]); - dimsIn.emplace_back(shape[n]); - dimsOut.emplace_back(getOutDim(starts[n], stops[n], strides[n])); } for (uint32_t n = _beginData->info()->tensor_shape().total_size(); n < 4; n++) { starts.emplace_back(0); - stops.emplace_back(1); strides.emplace_back(1); - dimsIn.emplace_back(1); - dimsOut.emplace_back(1); } // TODO: Apply shrinkAxisMask @@ -269,20 +221,7 @@ void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue) _stridesData->unmap(queue); _endData->unmap(queue); - // Set parameters - unsigned int idx = 2 * num_arguments_per_1D_tensor(); // Skip the input and output parameters - const cl_int4 dimsInArg = {{ - static_cast<cl_int>(dimsIn[0]), static_cast<cl_int>(dimsIn[1]), - static_cast<cl_int>(dimsIn[2]), static_cast<cl_int>(dimsIn[3]), - }}; - _kernel.setArg<cl_int4>(idx++, dimsInArg); - - const cl_int4 dimsOutArg = {{ - static_cast<cl_int>(dimsOut[0]), static_cast<cl_int>(dimsOut[1]), - static_cast<cl_int>(dimsOut[2]), static_cast<cl_int>(dimsOut[3]), - }}; - _kernel.setArg<cl_int4>(idx++, dimsOutArg); - + unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters const cl_int4 startsArg = {{ static_cast<cl_int>(starts[0]), static_cast<cl_int>(starts[1]), static_cast<cl_int>(starts[2]), static_cast<cl_int>(starts[3]), @@ -295,10 +234,20 @@ void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue) }}; _kernel.setArg<cl_int4>(idx++, stridesArg); - // TODO: Apply slicing output's window - idx = 0; - add_1D_tensor_argument(idx, _input, win_in); - add_1D_tensor_argument(idx, _output, window); + Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + // Setup output slice + Window slice_in(slice_out); + slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + 
slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_in.set(3, Window::Dimension(0, 0, 0)); - enqueue(queue, *this, window); + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_out); + } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); } diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp index d95b485b7..073c2f7bb 100644 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp @@ -17,15 +17,8 @@ #include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -#include <climits> -#include <cassert> namespace arm_compute { @@ -59,7 +52,7 @@ void CLTopKV2Single::configure(ICLTensor *input, ICLTensor *topk_values, ICLTens // Configure kernel window Window win; win.set(0, Window::Dimension(0, 1, 1)); - ICLKernel::configure(win); + ICLKernel::configure_internal(win); } void CLTopKV2Single::run(const Window &window, cl::CommandQueue &queue) @@ -102,7 +95,7 @@ void CLTopKV2Init::configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffe // Configure kernel window Window win; win.set(0, Window::Dimension(0, n, 1)); - ICLKernel::configure(win); + ICLKernel::configure_internal(win); } void CLTopKV2Init::run(const Window &window, cl::CommandQueue &queue) @@ -147,7 +140,7 @@ void CLRadixSortHistogram::configure(cl::Buffer *hist_buf, int bits, int n) // Configure kernel window Window win; win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1)); - ICLKernel::configure(win); + 
ICLKernel::configure_internal(win); } void CLRadixSortHistogram::run(const Window &window, cl::CommandQueue &queue) @@ -192,7 +185,7 @@ void CLRadixSortScanHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_ // Configure kernel window Window win; win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1)); - ICLKernel::configure(win); + ICLKernel::configure_internal(win); } void CLRadixSortScanHistogram::run(const Window &window, cl::CommandQueue &queue) @@ -236,7 +229,7 @@ void CLRadixSortGlobalScanHistogram::configure(cl::Buffer *glob_sum_buf, cl::Buf // Configure kernel window Window win; win.set(0, Window::Dimension(0, _HISTOSPLIT / 2, 1)); - ICLKernel::configure(win); + ICLKernel::configure_internal(win); } void CLRadixSortGlobalScanHistogram::run(const Window &window, cl::CommandQueue &queue) @@ -275,7 +268,7 @@ void CLRadixSortPasteHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob // Configure kernel window Window win; win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1)); - ICLKernel::configure(win); + ICLKernel::configure_internal(win); } void CLRadixSortPasteHistogram::run(const Window &window, cl::CommandQueue &queue) @@ -322,7 +315,7 @@ void CLRadixSortReorder::configure(cl::Buffer *hist_buf, int bits, int n) // Configure kernel window Window win; win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1)); - ICLKernel::configure(win); + ICLKernel::configure_internal(win); } void CLRadixSortReorder::run(const Window &window, cl::CommandQueue &queue) @@ -365,7 +358,7 @@ void CLTopKV2FindFirstNegative::configure(cl::Buffer *first_negative_idx_buf, in // Configure kernel window Window win; win.set(0, Window::Dimension(0, n, 1)); - ICLKernel::configure(win); + ICLKernel::configure_internal(win); } void CLTopKV2FindFirstNegative::run(const Window &window, cl::CommandQueue &queue) @@ -404,7 +397,7 @@ void CLTopKV2ReorderNegatives::configure(cl::Buffer *first_negative_idx_buf, int // Configure kernel window Window win; win.set(0, 
Window::Dimension(0, n, 1)); - ICLKernel::configure(win); + ICLKernel::configure_internal(win); } void CLTopKV2ReorderNegatives::run(const Window &window, cl::CommandQueue &queue) @@ -449,7 +442,7 @@ void CLTopKV2Store::configure(ICLTensor *values, ICLTensor *indices, int k, int // Configure kernel window Window win; win.set(0, Window::Dimension(0, k, 1)); - ICLKernel::configure(win); + ICLKernel::configure_internal(win); } void CLTopKV2Store::setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf) |