diff options
Diffstat (limited to 'runtimes/libs/ARMComputeEx/src/core/CL/kernels')
14 files changed, 2470 insertions, 0 deletions
diff --git a/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp new file mode 100644 index 000000000..7f4b5b0df --- /dev/null +++ b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis) +{ + TensorShape out_shape{input_shape}; + + out_shape.set(axis, 1); + + return out_shape; +} +} // namespace + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis, + ArgOperation /*op*/) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::S32, DataType::F32, DataType::U8, + DataType::QASYMM8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::S32); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape().num_dimensions() - 1) != + output->tensor_shape().num_dimensions(), + "Input's rank is not same with output"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0, + "Inputs are not broadcast compatible"); + + const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(), + "output shape's size does not match axis"); + + const auto num_dimensions = input->tensor_shape().num_dimensions(); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= num_dimensions, "axis must be less than (input's rank)."); + return Status{}; +} + +} // namespace + +CLArgOperationKernel::CLArgOperationKernel() : _input(nullptr), _output(nullptr), _axis() {} + +void CLArgOperationKernel::configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis, + ArgOperation op) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op)); + + _input = input; + _output = output; + _axis = axis; + + std::unique_ptr<ITensorInfo> output_info = output->info()->clone(); + output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis)); + + // Construct kernel and set op_code based on type of ArgOperation as specified by object op + std::string kernel_name = "arg_op"; + int op_code = 0; + if (op == ArgOperation::MAX) + { + op_code = 1; + } + else if (op == ArgOperation::MIN) + { + op_code = 2; + } + else + throw std::runtime_error("Operation not supported, yet"); + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2))); + build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code)); + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + + // Configure kernel window + Window win = calculate_max_window(*output_info, Steps()); + + Coordinates coord; + coord.set_num_dimensions(output_info->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape())); + + ICLKernel::configure_internal(win); +} + +Status CLArgOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + const uint32_t axis, ArgOperation op) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op)); + + return Status{}; +} + +void CLArgOperationKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const TensorShape &shape_in = _input->info()->tensor_shape(); + + unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters + + _kernel.setArg<cl_int>(idx++, _axis); + _kernel.setArg<cl_int>(idx++, shape_in[_axis]); + + Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + // Setup input slice + Window slice_in(slice_out); + slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_in.set(3, Window::Dimension(0, 0, 0)); + + // Copy output's shape in order to use for recovering at end of this method + const TensorShape shape_out = _output->info()->tensor_shape(); + _output->info()->set_tensor_shape(inferOutputShape(shape_in, _axis)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_out); + } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); + + // Recover output's shape of output tensor + _output->info()->set_tensor_shape(shape_out); +} diff --git a/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp new file mode 100644 index 000000000..c14e73634 --- /dev/null +++ b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +Status validate_parameters(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output) +{ + const TensorShape &out_shape = + TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, + "Inputs are not broadcast compatible"); + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, + DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), + "Wrong shape for output"); + } + return Status{}; +} +} // namespace + +CLBinaryLogicalOpKernel::CLBinaryLogicalOpKernel() + : _input1(nullptr), _input2(nullptr), _output(nullptr) +{ +} + +void CLBinaryLogicalOpKernel::configure(const ICLTensor *input1, const ICLTensor *input2, + ICLTensor *output, BinaryLogicalOperation op) +{ + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_parameters(input1->info(), input2->info(), output->info())); + + _input1 = input1; + _input2 = input2; + _output = output; + + // Create kernel + std::string kernel_name = "binary_logical_op"; + std::set<std::string> build_opts; + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type()))); + + int op_code = 0; + switch (op) + { + case BinaryLogicalOperation::AND: + op_code = 1; + break; + case BinaryLogicalOperation::OR: + op_code = 2; + break; + default: + throw std::runtime_error("Operation not supported, yet"); + } + + build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code))); + build_opts.emplace( + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + + const std::pair<TensorShape, ValidRegion> broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info()); + + const ValidRegion &valid_region = broadcast_pair.second; + + Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); + Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info()); + Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info()); + + AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win_input1, input1_access) || + update_window_and_padding(win_input2, input2_access) || + update_window_and_padding(win, output_access); + + output_access.set_valid_region(win, valid_region); + + ICLKernel::configure_internal(win); +} + +void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const TensorShape &in_shape1 = _input1->info()->tensor_shape(); + const TensorShape &in_shape2 = _input2->info()->tensor_shape(); + const TensorShape &out_shape = _output->info()->tensor_shape(); + + bool can_collapse = true; + if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) + { + can_collapse = + (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); + for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) + { + can_collapse = (in_shape1[d] == in_shape2[d]); + } + } + + bool has_collapsed = false; + Window collapsed = + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) + : window; + + const TensorShape &in_shape1_collapsed = + has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; + const TensorShape &in_shape2_collapsed = + has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; + + Window slice = collapsed.first_slice_window_3D(); + Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); + Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input1, slice_input1); + add_3D_tensor_argument(idx, _input2, slice_input2); + add_3D_tensor_argument(idx, _output, slice); + + enqueue(queue, *this, slice); + + collapsed.slide_window_slice_3D(slice_input1); + collapsed.slide_window_slice_3D(slice_input2); + } while (collapsed.slide_window_slice_3D(slice)); +} + +BorderSize CLBinaryLogicalOpKernel::border_size() const +{ + const unsigned int replicateSize = + _output->info()->dimension(0) - + std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); + const unsigned int border = + std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); + return BorderSize(0, border, 0, 0); +} diff --git a/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp new file mode 100644 index 000000000..ac2963f38 --- /dev/null +++ b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLCastKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +CLCastKernel::CLCastKernel() : _input(nullptr), _output(nullptr) {} + +void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); + + _input = input; + _output = output; + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); + build_opts.emplace( + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + + // Create kernel + if (is_data_type_quantized_asymmetric(input->info()->data_type())) + { + const float scale_in = input->info()->quantization_info().scale; + const int offset_in = input->info()->quantization_info().offset; + build_opts.emplace("-DSCALE=" + float_to_string_with_full_precision(scale_in)); + build_opts.emplace("-DOFFSET=" + support::cpp11::to_string(offset_in)); + + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("cast_qasymm_in", build_opts)); + } + else if (is_data_type_quantized_asymmetric(output->info()->data_type())) + { + const float scale_in = output->info()->quantization_info().scale; + const int offset_in = output->info()->quantization_info().offset; + build_opts.emplace("-DSCALE=" + float_to_string_with_full_precision(scale_in)); + build_opts.emplace("-DOFFSET=" + support::cpp11::to_string(offset_in)); + + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("cast_qasymm_out", build_opts)); + } + else + { + _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("cast", build_opts)); + } + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, input->info()->valid_region()); + + ICLKernel::configure_internal(win); +} + +void CLCastKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice); + add_3D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice, lws_hint()); + } while (collapsed.slide_window_slice_3D(slice)); +} diff --git a/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp new file mode 100644 index 000000000..2a3433c2b --- /dev/null +++ b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +// TODO Use this validation function +#if 0 +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const int32_t block_size) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1, + "Block size should be greater than or equal to 1."); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0) * block_size, + "Output width should be equal to (Input width * block size)"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(1) != input->dimension(1) * block_size, + "Output height should be equal to (Input height * block size)"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) % (block_size * block_size) != 0, + "Input depth should be divisible by (block size * block size)"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + output->dimension(2) != input->dimension(2) / (block_size * block_size), + "Output depth should be equal to (Input depth / (block size * block size))"); + + return Status{}; +} +#endif +} // namespace + +CLDepthToSpaceKernel::CLDepthToSpaceKernel() : _input(nullptr), _output(nullptr) +{ + // DO NOTHING +} + +void CLDepthToSpaceKernel::configure(const ICLTensor *input, ICLTensor *output, + const int32_t block_size) +{ + // TODO Add validation of data_layout + _input = input; + _output = output; + + // Set kernel build options + auto layout_out = output->info()->data_layout(); + std::set<std::string> build_opts; + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size)); + auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL); + auto depth = output->info()->dimension(index_depth); + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(depth)); + build_opts.emplace("-DZ_OUT=" + support::cpp11::to_string(output->info()->tensor_shape().z())); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel( + "depth_to_space_" + lower_string(string_from_data_layout(layout_out)), build_opts)); + + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps()); + + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + + ICLKernel::configure_internal(win); +} + +void CLDepthToSpaceKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + + Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + // Setup input slice + Window slice_in(slice_out); + slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_in.set(3, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_out); + } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); +} diff --git a/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp new file mode 100644 index 000000000..0862b78bf --- /dev/null +++ b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, input_access, output_access); + input_access.set_valid_region(win, output->valid_region()); + + Status err = (window_changed) + ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + return std::make_pair(err, win); +} +} // namespace + +CLEmbeddingLookupKernel::CLEmbeddingLookupKernel() + : _input(nullptr), _output(nullptr), _lookups(nullptr) +{ +} + +Status CLEmbeddingLookupKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *lookups) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1); + + return Status{}; +} + +void CLEmbeddingLookupKernel::configure(const ICLTensor *input, ICLTensor *output, + const ICLTensor *lookups) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info())); + + _input = input; + _output = output; + _lookups = lookups; + + // Set kernel build options + std::stringstream kernel_name; + std::set<std::string> build_opts; + kernel_name << "embedding_lookup"; + + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions())); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts)); + + // Configure kernel window + auto win_config = validate_and_configure_window(input->info(), output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); +} + +void CLEmbeddingLookupKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + Window win_lookup; + win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_in); + add_1D_tensor_argument(idx, _lookups, win_lookup); + + enqueue(queue, *this, slice_in); + } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_1D(win_lookup)); +} diff --git a/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp new file mode 100644 index 000000000..c83ece0e9 --- /dev/null +++ b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLGatherExKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/UtilsEx.h" + +using namespace arm_compute; + +namespace +{ + +inline TensorShape compute_gather_shape(const TensorShape &input_shape, + const TensorShape &indices_shape, uint32_t actual_axis) +{ + ARM_COMPUTE_ERROR_ON(indices_shape.num_dimensions() > 3); + ARM_COMPUTE_ERROR_ON(input_shape.num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(input_shape.num_dimensions() + indices_shape.num_dimensions() - 1 > 4); + ARM_COMPUTE_ERROR_ON(actual_axis >= input_shape.num_dimensions()); + + TensorShape output_shape = input_shape; + if (indices_shape.num_dimensions() == 1) + { + output_shape[actual_axis] = indices_shape[0]; + } + else if (indices_shape.num_dimensions() > 1) + { + output_shape.shift_right(indices_shape.num_dimensions() - 1); + + for (uint32_t i = 0, o = 0; o < output_shape.num_dimensions(); ++o, ++i) + { + if (o == actual_axis) + { + ++i; + for (uint32_t in = 0; in < indices_shape.num_dimensions(); ++in, ++o) + { + output_shape[o] = indices_shape[in]; + } + } + else + { + output_shape[o] = input_shape[i]; + } + } + } + return output_shape; +} + +/** Wrap-around a number within the range 0 <= x < m + * + * @param[in] x Input value + * @param[in] m Range + * + * @return the wrapped-around number + */ +template <typename T> inline T wrap_around(T x, T m) { return x >= 0 ? x % m : (x % m + m) % m; } + +inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis) +{ + const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions())); + ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 3); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > 4); + ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= input->num_dimensions()); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); + TensorShape output_shape = + compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis); + ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); + } + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32); + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *indices, + ITensorInfo *output, int axis) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices); + const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions())); + std::unique_ptr<ITensorInfo> output_info = input->clone(); + output_info->set_tensor_shape( + compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis)); + // Output auto initialization if not yet initialized + auto_init_if_empty((*output), output_info->tensor_shape(), 1, input->data_type()); + + // Create window + Window win = calculate_max_window(*output, Steps()); + output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); + + return std::make_pair(Status{}, win); +} + +} // namespace + +CLGatherExKernel::CLGatherExKernel() + : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0) +{ +} + +void CLGatherExKernel::configure(const ICLTensor *input, const ICLTensor *indices, + ICLTensor *output, int axis) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), indices->info(), output->info(), axis)); + + // Configure kernel window + auto win_config = + validate_and_configure_window(input->info(), indices->info(), output->info(), axis); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + + _input = input; + _output = output; + _indices = indices; + _axis = wrap_around(axis, static_cast<int>(input->info()->num_dimensions())); + + // Set build options + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.add_option("-DOUTPUT_DIM_Z=" + + support::cpp11::to_string(output->info()->dimension(2))); + build_opts.add_option("-DINPUT_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2))); + build_opts.add_option("-DAXIS=" + support::cpp11::to_string(_axis)); + build_opts.add_option("-DINDICES_DIM=" + + support::cpp11::to_string(indices->info()->num_dimensions())); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("gather_ex", build_opts.options())); + ICLKernel::configure_internal(win_config.second); +} + +Status CLGatherExKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, indices, output, axis)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), + indices->clone().get(), + output->clone().get(), axis) + .first); + return Status{}; +} + +void CLGatherExKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ, 4); + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, window_collapsed); + add_3D_tensor_argument(idx, _indices, window_collapsed); + add_4D_tensor_argument(idx, _output, window_collapsed); + enqueue(queue, *this, window_collapsed, lws_hint()); +} diff --git a/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp new file mode 100644 index 000000000..31e98c9a8 --- /dev/null +++ b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, input_access, output_access); + input_access.set_valid_region(win, output->valid_region()); + + Status err = (window_changed) + ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + return std::make_pair(err, win); +} +} // namespace + +CLHashtableLookupKernel::CLHashtableLookupKernel() +{ + // DO NOTHING +} + +Status CLHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITensorInfo *keys, + const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *hits) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0, + "Output's shape was not set"); + + ARM_COMPUTE_ERROR_ON(lookups->dimension(0) != hits->dimension(0) || + output->dimension(output->num_dimensions() - 1) != lookups->dimension(0)); + ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1); + ARM_COMPUTE_ERROR_ON(keys->num_dimensions() > 1); + ARM_COMPUTE_ERROR_ON(hits->num_dimensions() > 1); + + return Status{}; +} + +void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTensor *keys, + const ICLTensor *input, ICLTensor *output, ICLTensor *hits) +{ + ARM_COMPUTE_ERROR_THROW_ON( + validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info())); + + _lookups = lookups; + _keys = keys; + _input = input; + _output = output; + _hits = hits; + + // Make _lookup_indices tensor + _lookup_indices = arm_compute::support::cpp14::make_unique<CLTensor>(); + _lookup_indices->allocator()->init( + TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32)); + _lookup_indices->allocator()->allocate(); + + // Set kernel build options + std::stringstream kernel_name; + std::set<std::string> build_opts; + kernel_name << "hashtable_lookup"; + + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions())); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts)); + + // Configure kernel window + auto win_config = validate_and_configure_window(input->info(), output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); +} + +void CLHashtableLookupKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const_cast<ICLTensor *>(_lookups)->map(queue); + const_cast<ICLTensor *>(_keys)->map(queue); + _hits->map(queue); + _lookup_indices->map(queue); + + // Set values of hits + const int32_t *lookups_buf = + reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_lookups)->buffer()); + const int32_t *keys_buf = reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_keys)->buffer()); + uint8_t *hits_buf = reinterpret_cast<uint8_t *>(_hits->buffer()); + int32_t *lookup_indices_buf = reinterpret_cast<int32_t *>(_lookup_indices->buffer()); + + std::map<int32_t, size_t> key_map; + const size_t keys_num = _keys->info()->dimension(0); + for (size_t key_index = 0; key_index < keys_num; key_index++) + { + key_map[keys_buf[key_index]] = key_index; + } + + const size_t lookups_num = _lookups->info()->dimension(0); + for (size_t i = 0; i < lookups_num; ++i) + { + const auto lookup_value = lookups_buf[i]; + const auto it = key_map.find(lookup_value); + if (it != key_map.end()) + { +#if defined(ARM_COMPUTE_DEBUG_ENABLED) + if (it->second >= lookups_num) + ARM_COMPUTE_ERROR("HashTable Lookup: index out of bounds."); +#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) + lookup_indices_buf[i] = static_cast<int32_t>(it->second); + hits_buf[i] = static_cast<uint8_t>(1); + } + else + { + lookup_indices_buf[i] = -1; + hits_buf[i] = static_cast<uint8_t>(0); + } + } + + const_cast<ICLTensor *>(_lookups)->unmap(queue); + const_cast<ICLTensor *>(_keys)->unmap(queue); + _hits->unmap(queue); + _lookup_indices->unmap(queue); + + Window win = window.collapse(ICLKernel::window(), 2, 4); + + Window win_lookup; + win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, win); + add_4D_tensor_argument(idx, _output, win); + add_1D_tensor_argument(idx, _lookup_indices.get(), win_lookup); + + enqueue(queue, *this, win); + } while (window.slide_window_slice_4D(win) && window.slide_window_slice_1D(win_lookup)); +} diff --git a/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp new file mode 100644 index 000000000..ecfe05a51 --- /dev/null +++ b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLNegKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16, DataType::S32, + DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16, DataType::S32, + DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape()); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + return Status{}; +} + +} // namespace + +CLNegKernel::CLNegKernel() : _input(nullptr), _output(nullptr) {} + +void CLNegKernel::configure(const ICLTensor *input, ICLTensor *output) +{ + + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info())); + + _input = input; + _output = output; + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Create kernel + std::set<std::string> build_opts; + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.emplace( + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("neg_tensor", build_opts)); + + // Configure window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, input->info()->valid_region()); + + ICLKernel::configure_internal(win); +} + +void CLNegKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice); + add_3D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice, lws_hint()); + } while (collapsed.slide_window_slice_3D(slice)); +} diff --git a/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp new file mode 100644 index 000000000..e7d587029 --- /dev/null +++ b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLPReLUKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +Status validate_info(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output) +{ + const TensorShape &out_shape = + TensorShape::broadcast_shape(input->tensor_shape(), alpha->tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, + DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(alpha, 1, DataType::F16, DataType::F32, + DataType::QASYMM8); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, + "Inputs are not broadcast compatible"); + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32, + DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), + "Wrong shape for output"); + } + return Status{}; +} +} // namespace + +CLPReLUKernel::CLPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {} + +void CLPReLUKernel::configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, alpha); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_info(input->info(), alpha->info(), output->info())); + + _input = input; + _alpha = alpha; + _output = output; + + // Create kernel + std::string kernel_name = "prelu"; + std::set<std::string> build_opts; + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.emplace( + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + + if (is_data_type_quantized_asymmetric(input->info()->data_type())) + { + build_opts.emplace("-DOFF_IN=" + + support::cpp11::to_string(input->info()->quantization_info().offset)); + build_opts.emplace("-DOFF_ALPHA=" + + support::cpp11::to_string(alpha->info()->quantization_info().offset)); + build_opts.emplace("-DOFF_OUT=" + + support::cpp11::to_string(output->info()->quantization_info().offset)); + build_opts.emplace("-DSCALE_IN=" + + support::cpp11::to_string(input->info()->quantization_info().scale)); + build_opts.emplace("-DSCALE_ALPHA=" + + support::cpp11::to_string(alpha->info()->quantization_info().scale)); + build_opts.emplace("-DSCALE_OUT=" + + support::cpp11::to_string(output->info()->quantization_info().scale)); + kernel_name += "_qasymm8"; + } + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + + const std::pair<TensorShape, ValidRegion> broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info()); + + const TensorShape &out_shape = broadcast_pair.first; + const ValidRegion &valid_region = broadcast_pair.second; + + // Auto initialize output if not initialized + { + set_shape_if_empty(*output->info(), out_shape); + + if (input->info()->data_type() == DataType::F16 && alpha->info()->data_type() == DataType::F16) + { + set_format_if_unknown(*output->info(), Format::F16); + } + else if (input->info()->data_type() == DataType::F32 || + alpha->info()->data_type() == DataType::F32) + { + set_format_if_unknown(*output->info(), Format::F32); + } + } + + Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); + Window win_input1 = win.broadcast_if_dimension_le_one(*input->info()); + Window win_input2 = win.broadcast_if_dimension_le_one(*alpha->info()); + + AccessWindowHorizontal input1_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input2_access(alpha->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win_input1, input1_access) || + update_window_and_padding(win_input2, input2_access) || + update_window_and_padding(win, output_access); + + output_access.set_valid_region(win, valid_region); + + ICLKernel::configure_internal(win); +} + +void CLPReLUKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const TensorShape &in_shape1 = _input->info()->tensor_shape(); + const TensorShape &in_shape2 = _alpha->info()->tensor_shape(); + const TensorShape &out_shape = _output->info()->tensor_shape(); + + bool can_collapse = true; + if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) + { + can_collapse = + (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); + for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) + { + can_collapse = (in_shape1[d] == in_shape2[d]); + } + } + + bool has_collapsed = false; + Window collapsed = + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) + : window; + + const TensorShape &in_shape1_collapsed = + has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; + const TensorShape &in_shape2_collapsed = + has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; + + Window slice = collapsed.first_slice_window_3D(); + Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); + Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice_input1); + add_3D_tensor_argument(idx, _alpha, slice_input2); + add_3D_tensor_argument(idx, _output, slice); + + enqueue(queue, *this, slice); + + collapsed.slide_window_slice_3D(slice_input1); + collapsed.slide_window_slice_3D(slice_input2); + } while (collapsed.slide_window_slice_3D(slice)); +} + +BorderSize CLPReLUKernel::border_size() const +{ + const unsigned int replicateSize = + _output->info()->dimension(0) - + std::min(_input->info()->dimension(0), _alpha->info()->dimension(0)); + const unsigned int border = + std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); + return BorderSize(0, border, 0, 0); +} diff --git a/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp new file mode 100644 index 000000000..24e89db28 --- /dev/null +++ b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; +namespace +{ +// NOTE This is necessary because it is not guaranteed that the axis positions of input and output +// are the same. +const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis) +{ + TensorShape out_shape{input_shape}; + + out_shape.set(axis, 1); + + return out_shape; +} +} // namespace + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis, + ReduceOperation op) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + } + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, + DataType::F32, DataType::S32); + if (op == ReduceOperation::SUM) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8, + "Not support QASYMM8, yet"); + } + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0, + "Inputs are not broadcast compatible"); + + const auto num_dimensions = input->tensor_shape().num_dimensions(); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= num_dimensions, "axis must be less than (input's rank)."); + + const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(), + "output shape's size does not match axis"); + + return Status{}; +} +} // namespace + +CLReduceOperationKernel::CLReduceOperationKernel() : _input(nullptr), _output(nullptr), _axis() {} + +void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *output, + const uint32_t axis, ReduceOperation op) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op)); + + _input = input; + _output = output; + _axis = axis; + + std::unique_ptr<ITensorInfo> output_info = output->info()->clone(); + output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis)); + + // Construct kernel name + std::string kernel_name; + int op_code = 0; + if (op == ReduceOperation::MAX) + { + kernel_name = "reduce_min_max"; + op_code = 1; + } + else if (op == ReduceOperation::MIN) + { + kernel_name = "reduce_min_max"; + op_code = 2; + } + else if (op == ReduceOperation::SUM) + { + kernel_name = "reduce_sum_mean"; + op_code = 3; + } + else if (op == ReduceOperation::MEAN) + { + kernel_name = "reduce_sum_mean"; + op_code = 4; + } + else + throw std::runtime_error("Operation not supported, yet"); + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(output_info->data_type())); + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2))); + build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code)); + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + + // Configure kernel window + Window win = calculate_max_window(*output_info, Steps()); + + Coordinates coord; + coord.set_num_dimensions(output_info->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape())); + + ICLKernel::configure_internal(win); +} + +Status CLReduceOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + const uint32_t axis, ReduceOperation op) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op)); + + return Status{}; +} + +void CLReduceOperationKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const TensorShape &shape_in = _input->info()->tensor_shape(); + + unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters + + _kernel.setArg<cl_int>(idx++, _axis); + _kernel.setArg<cl_int>(idx++, shape_in[_axis]); + + // Support dimensions up to 4 + Window slice_out = window.collapse(ICLKernel::window(), 2, 4); + + // Setup input slice + Window slice_in(slice_out); + slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_in.set(3, Window::Dimension(0, 0, 0)); + + // Copy output's shape in order to use for recovering at end of this method + // TODO Remove changing and recovering output's shape if it is guaranteed that the axis positions + // of input and output are the same + const TensorShape shape_out = _output->info()->tensor_shape(); + _output->info()->set_tensor_shape(inferOutputShape(shape_in, _axis)); + + idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_out, lws_hint()); + + // Recover output's shape of output tensor + _output->info()->set_tensor_shape(shape_out); +} diff --git a/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp new file mode 100644 index 000000000..f7836b6cd --- /dev/null +++ b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_size, + const ITensorInfo *padding_size, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::F16, DataType::S32, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_size, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(padding_size, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::F16, DataType::S32, + DataType::F32); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != output->num_dimensions(), + "The number of dimensions of input should be equal to output"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() != output->data_layout(), + "The input and output layouts are different!"); + + // TODO Support other cases + if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NCHW) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != output->dimension(2), + "Input Depth should be equal to Output Depth"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 || + padding_size->dimension(1) != 2, + "Only 2-dimensional spatial block's size was wrong"); + } + else if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NHWC) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(0) != output->dimension(0), + "Input Depth should be equal to Output Depth"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 || + padding_size->dimension(1) != 2, + "Only 2-dimensional spatial block's size was wrong"); + } + else + { + ARM_COMPUTE_RETURN_ERROR_MSG("CLSpaceToBatchNDKernel supports only 4-dimensional input"); + } + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() < 2 && input->num_dimensions() > 4, + "CLSpaceToBatchNDKernel supports dimensions up to 4"); + + if (input->data_type() == DataType::QASYMM8) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->quantization_info() != output->quantization_info(), + "The input and output quantization info are different!"); + } + + return Status{}; +} + +} // namespace + +CLSpaceToBatchNDKernel::CLSpaceToBatchNDKernel() +{ + // DO NOTHING +} + +void CLSpaceToBatchNDKernel::configure(const ICLTensor *input, const ICLTensor *block_size, + const ICLTensor *padding_size, ICLTensor *output) +{ + + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), block_size->info(), padding_size->info(), output->info())); + + _input = input; + _block_size = block_size; + _padding_size = padding_size; + _output = output; + + // Set kernel build options + // TODO Support other cases + std::string kernel_name = "space_to_batch_4d"; + std::set<std::string> build_opts; + Window win; + + if (input->info()->data_layout() == DataLayout::NCHW) + { + kernel_name += "_nchw"; + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); + build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(1))); + build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(0))); + + win = calculate_max_window(*output->info(), Steps()); + + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + } + else if (input->info()->data_layout() == DataLayout::NHWC) + { + kernel_name += "_nhwc"; + build_opts.emplace("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); + build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(2))); + build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(1))); + build_opts.emplace("-DVEC_SIZE=" + + support::cpp11::to_string(num_elems_processed_per_iteration)); + + win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, input_access, output_access); + input_access.set_valid_region(win, output->info()->valid_region()); + + if (window_changed) + { + ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!"); + } + } + else + { + ARM_COMPUTE_ERROR("Unsupported layout"); + } + + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(3))); + if (input->info()->data_type() == DataType::QASYMM8) + { + build_opts.emplace("-DZERO_VALUE=" + + support::cpp11::to_string(input->info()->quantization_info().offset)); + } + else + { + build_opts.emplace("-DZERO_VALUE=" + support::cpp11::to_string(0)); + } + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + + // Configure kernel window + ICLKernel::configure_internal(win); +} + +void CLSpaceToBatchNDKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + +#if defined(ARM_COMPUTE_DEBUG_ENABLED) + const_cast<ICLTensor *>(_block_size)->map(queue); + const_cast<ICLTensor *>(_padding_size)->map(queue); + + const size_t num_dimensions = _input->info()->num_dimensions(); + const size_t num_spacial_dimensions = _block_size->info()->dimension(0); + uint32_t batch_size = _input->info()->dimension(num_dimensions - 1); + for (size_t i = 0; i < num_spacial_dimensions; ++i) + { + const int32_t block_size = *reinterpret_cast<int32_t *>(_block_size->ptr_to_element({i})); + const int32_t padding_size_pre = + *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({0, i})); + const int32_t padding_size_post = + *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({1, i})); + + ARM_COMPUTE_ERROR_ON_MSG(block_size < 1, "Block size should be greater than or equal to 1"); + ARM_COMPUTE_ERROR_ON_MSG(padding_size_pre < 0 && padding_size_post < 0, + "Padding size should be greater than or equal to 0"); + + if (num_dimensions == 4 && _input->info()->data_layout() == DataLayout::NCHW) + { + ARM_COMPUTE_ERROR_ON_MSG( + _output->info()->dimension(i) != + (_input->info()->dimension(i) + padding_size_pre + padding_size_post) / block_size, + "Dimension value of spatial block does not match output's dimension value"); + } + else + { + ARM_COMPUTE_ERROR_ON_MSG( + _output->info()->dimension(num_dimensions - num_spacial_dimensions - 1 + i) != + (_input->info()->dimension(num_dimensions - num_spacial_dimensions - 1 + i) + + padding_size_pre + padding_size_post) / + block_size, + "Dimension value of spatial block does not match output's dimension value"); + } + + batch_size *= block_size; + } + ARM_COMPUTE_ERROR_ON_MSG( + _output->info()->dimension(num_dimensions - 1) != batch_size, + "Output batch size should be equal to input batch size * (multiplication of all block size)"); + + const_cast<ICLTensor *>(_block_size)->unmap(queue); + const_cast<ICLTensor *>(_padding_size)->unmap(queue); +#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) + + Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + // Setup output slice + Window slice_in(slice_out); + slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_in.set(3, Window::Dimension(0, 0, 0)); + + // Set block size window + Window win_block = calculate_max_window(*_block_size->info(), Steps()); + + // Set padding size window + Window win_padding = calculate_max_window(*_padding_size->info(), Steps()); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + add_1D_tensor_argument(idx, _block_size, win_block); + add_2D_tensor_argument(idx, _padding_size, win_padding); + enqueue(queue, *this, slice_out); + } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in)); +} diff --git a/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp new file mode 100644 index 000000000..b085192a2 --- /dev/null +++ b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const int32_t block_size) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1, + "Block size should be greater than or equal to 1."); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(3) != output->dimension(3), + "Input batch should be equal to Output batch"); + + auto layout_out = input->data_layout(); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); + + auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL); + auto index_height = get_data_layout_dimension_index(layout_out, DataLayoutDimension::HEIGHT); + auto index_width = get_data_layout_dimension_index(layout_out, DataLayoutDimension::WIDTH); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + input->dimension(index_depth) * block_size * block_size != output->dimension(index_depth), + "Output depth should be equal to (input depth * block size *block size)"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->dimension(index_width) % block_size) || + (input->dimension(index_height) % block_size), + "Input height and width should be divisible by block size"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (output->dimension(index_width) != (input->dimension(index_width) / block_size)) || + (output->dimension(index_height) != (input->dimension(index_height) / block_size)), + "Output height and width should be equal to " + "input_height/blocksize and input_width/blocksize respectively"); + + return Status{}; +} + +} // namespace + +CLSpaceToDepthKernel::CLSpaceToDepthKernel() : _input(nullptr), _output(nullptr) {} + +void CLSpaceToDepthKernel::configure(const ICLTensor *input, ICLTensor *output, + const int32_t block_size) +{ + + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size)); + + _input = input; + _output = output; + + // Set kernel build options + auto layout_out = input->info()->data_layout(); + std::set<std::string> build_opts; + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size)); + auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL); + auto depth = input->info()->dimension(index_depth); + build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(depth)); + build_opts.emplace("-DZ_IN=" + support::cpp11::to_string(input->info()->tensor_shape().z())); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel( + "space_to_depth_" + lower_string(string_from_data_layout(layout_out)), build_opts)); + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps()); + + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + + ICLKernel::configure_internal(win); +} + +void CLSpaceToDepthKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + + Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + // Setup output slice + Window slice_out(slice_in); + slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_out.set(3, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_in); + } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); +} diff --git a/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp new file mode 100644 index 000000000..073c2f7bb --- /dev/null +++ b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp @@ -0,0 +1,468 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +namespace arm_compute +{ +//////////////////////////////////////////////////////////////////////////////// +CLTopKV2Single::CLTopKV2Single() : _input(nullptr), _topk_values(nullptr), _topk_indices(nullptr) {} + +void CLTopKV2Single::configure(ICLTensor *input, ICLTensor *topk_values, ICLTensor *topk_indices, + cl::Buffer *indices, cl::Buffer *temp_stack, int k, int n) +{ + ARM_COMPUTE_ERROR_ON(input == nullptr && indices == nullptr); + ARM_COMPUTE_ERROR_ON(topk_values == nullptr && topk_indices == nullptr); + ARM_COMPUTE_ERROR_ON(n == 0); + + _input = input; + _topk_values = topk_values; + _topk_indices = topk_indices; + + // Set kernel build options + std::set<std::string> build_opts; + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("topkv2_quicksort", build_opts)); + + unsigned int idx = 3 * num_arguments_per_1D_tensor(); + _kernel.setArg(idx++, *indices); + _kernel.setArg(idx++, *temp_stack); + _kernel.setArg<cl_int>(idx++, k); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, 1, 1)); + ICLKernel::configure_internal(win); +} + +void CLTopKV2Single::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int idx = 0; + add_1D_tensor_argument(idx, _input, window); + add_1D_tensor_argument(idx, _topk_values, window); + add_1D_tensor_argument(idx, _topk_indices, window); + + enqueue(queue, *this, window); +} + +//////////////////////////////////////////////////////////////////////////////// +CLTopKV2Init::CLTopKV2Init() : _input(nullptr) {} + +void CLTopKV2Init::configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffer *in_ind_buf, + int n) +{ + ARM_COMPUTE_ERROR_ON(input == nullptr && in_key_buf == nullptr); + ARM_COMPUTE_ERROR_ON(in_ind_buf == nullptr); + ARM_COMPUTE_ERROR_ON(n == 0); + + _input = input; + + // Set kernel build options + std::set<std::string> build_opts; + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("topkv2_init", build_opts)); + + unsigned int idx = num_arguments_per_1D_tensor(); + _kernel.setArg(idx++, *in_key_buf); + _kernel.setArg(idx++, *in_ind_buf); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, n, 1)); + ICLKernel::configure_internal(win); +} + +void CLTopKV2Init::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int idx = 0; + add_1D_tensor_argument(idx, _input, window); + + enqueue(queue, *this, window); +} + +//////////////////////////////////////////////////////////////////////////////// +// This kernel makes a histogram of radix for each work item. +CLRadixSortHistogram::CLRadixSortHistogram() : _pass(0), _in_key_buf(nullptr) {} + +void CLRadixSortHistogram::configure(cl::Buffer *hist_buf, int bits, int n) +{ + ARM_COMPUTE_ERROR_ON(hist_buf == nullptr); + + unsigned int radix = 1 << bits; + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); + build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); + build_opts.emplace("-DPERMUT=1"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("radixsort_histogram", build_opts)); + + int loc_histo_size = radix * _ITEMS * sizeof(cl_int); + + unsigned int idx = 1; + _kernel.setArg(idx++, *hist_buf); + + idx = 3; + _kernel.setArg(idx++, loc_histo_size, nullptr); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1)); + ICLKernel::configure_internal(win); +} + +void CLRadixSortHistogram::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + _kernel.setArg(0, *_in_key_buf); + _kernel.setArg<cl_int>(2, _pass); + + cl::NDRange lws = cl::NDRange(_ITEMS, 1); + + enqueue(queue, *this, window, lws); +} + +//////////////////////////////////////////////////////////////////////////////// +CLRadixSortScanHistogram::CLRadixSortScanHistogram() {} + +void CLRadixSortScanHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits) +{ + ARM_COMPUTE_ERROR_ON(hist_buf == nullptr && glob_sum_buf == nullptr); + + unsigned int radix = 1 << bits; + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); + build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); + build_opts.emplace("-DPERMUT=1"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("radixsort_scanhistograms", build_opts)); + + int temp_size = + std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint); + + unsigned int idx = 0; + _kernel.setArg(idx++, *hist_buf); + _kernel.setArg(idx++, temp_size, nullptr); + _kernel.setArg(idx++, *glob_sum_buf); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1)); + ICLKernel::configure_internal(win); +} + +void CLRadixSortScanHistogram::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); + cl::NDRange lws = cl::NDRange(gws_x / _HISTOSPLIT, 1); + + enqueue(queue, *this, window, lws); +} + +//////////////////////////////////////////////////////////////////////////////// +CLRadixSortGlobalScanHistogram::CLRadixSortGlobalScanHistogram() {} + +void CLRadixSortGlobalScanHistogram::configure(cl::Buffer *glob_sum_buf, cl::Buffer *temp_buf, + int bits) +{ + ARM_COMPUTE_ERROR_ON(glob_sum_buf == nullptr && temp_buf == nullptr); + + unsigned int radix = 1 << bits; + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); + build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); + build_opts.emplace("-DPERMUT=1"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("radixsort_scanhistograms", build_opts)); + + int temp_size = + std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint); + + unsigned int idx = 0; + _kernel.setArg(idx++, *glob_sum_buf); + _kernel.setArg(idx++, temp_size, nullptr); + _kernel.setArg(idx++, *temp_buf); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, _HISTOSPLIT / 2, 1)); + ICLKernel::configure_internal(win); +} + +void CLRadixSortGlobalScanHistogram::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); + cl::NDRange lws = cl::NDRange(gws_x, 1); + + enqueue(queue, *this, window, lws); +} + +//////////////////////////////////////////////////////////////////////////////// +CLRadixSortPasteHistogram::CLRadixSortPasteHistogram() {} + +void CLRadixSortPasteHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits) +{ + ARM_COMPUTE_ERROR_ON(hist_buf == nullptr && glob_sum_buf == nullptr); + + unsigned int radix = 1 << bits; + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); + build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); + build_opts.emplace("-DPERMUT=1"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("radixsort_pastehistograms", build_opts)); + + unsigned int idx = 0; + _kernel.setArg(idx++, *hist_buf); + _kernel.setArg(idx++, *glob_sum_buf); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1)); + ICLKernel::configure_internal(win); +} + +void CLRadixSortPasteHistogram::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); + cl::NDRange lws = cl::NDRange(gws_x / _HISTOSPLIT, 1); + + enqueue(queue, *this, window, lws); +} + +//////////////////////////////////////////////////////////////////////////////// +CLRadixSortReorder::CLRadixSortReorder() + : _pass(0), _in_key_buf(nullptr), _out_key_buf(nullptr), _in_ind_buf(nullptr), + _out_ind_buf(nullptr) +{ +} + +void CLRadixSortReorder::configure(cl::Buffer *hist_buf, int bits, int n) +{ + ARM_COMPUTE_ERROR_ON(hist_buf == nullptr); + ARM_COMPUTE_ERROR_ON(n == 0); + + unsigned int radix = 1 << bits; + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); + build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); + build_opts.emplace("-DPERMUT=1"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("radixsort_reorder", build_opts)); + + unsigned int idx = 2; + _kernel.setArg(idx++, *hist_buf); + + idx = 6; + _kernel.setArg(idx++, sizeof(uint) * radix * _ITEMS, nullptr); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1)); + ICLKernel::configure_internal(win); +} + +void CLRadixSortReorder::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); + unsigned int lx = std::max(1U, (gws_x / _HISTOSPLIT)); + cl::NDRange lws = (lx < gws_x) ? cl::NDRange(lx, 1) : cl::NDRange(1, 1); + + _kernel.setArg(0, *_in_key_buf); + _kernel.setArg(1, *_out_key_buf); + _kernel.setArg<cl_int>(3, _pass); + _kernel.setArg(4, *_in_ind_buf); + _kernel.setArg(5, *_out_ind_buf); + + enqueue(queue, *this, window, lws); +} + +//////////////////////////////////////////////////////////////////////////////// +CLTopKV2FindFirstNegative::CLTopKV2FindFirstNegative() : _out_key_buf(nullptr) {} + +void CLTopKV2FindFirstNegative::configure(cl::Buffer *first_negative_idx_buf, int n) +{ + ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr); + ARM_COMPUTE_ERROR_ON(n == 0); + + // Set kernel build options + std::set<std::string> build_opts; + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("topkv2_find_first_negative", build_opts)); + + unsigned int idx = 1; + _kernel.setArg(idx++, *first_negative_idx_buf); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, n, 1)); + ICLKernel::configure_internal(win); +} + +void CLTopKV2FindFirstNegative::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int idx = 0; + _kernel.setArg(idx++, *_out_key_buf); + + enqueue(queue, *this, window); +} + +//////////////////////////////////////////////////////////////////////////////// +CLTopKV2ReorderNegatives::CLTopKV2ReorderNegatives() + : _in_key_buf(nullptr), _out_key_buf(nullptr), _in_ind_buf(nullptr), _out_ind_buf(nullptr) +{ +} + +void CLTopKV2ReorderNegatives::configure(cl::Buffer *first_negative_idx_buf, int n) +{ + ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr); + ARM_COMPUTE_ERROR_ON(n == 0); + + // Set kernel build options + std::set<std::string> build_opts; + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("topkv2_reorder_negatives", build_opts)); + + unsigned int idx = 4; + _kernel.setArg(idx++, *first_negative_idx_buf); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, n, 1)); + ICLKernel::configure_internal(win); +} + +void CLTopKV2ReorderNegatives::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int idx = 0; + _kernel.setArg(idx++, *_in_key_buf); + _kernel.setArg(idx++, *_out_key_buf); + _kernel.setArg(idx++, *_in_ind_buf); + _kernel.setArg(idx++, *_out_ind_buf); + + enqueue(queue, *this, window); +} + +//////////////////////////////////////////////////////////////////////////////// +CLTopKV2Store::CLTopKV2Store() + : _values(nullptr), _indices(nullptr), _out_key_buf(nullptr), _out_ind_buf(nullptr) +{ +} + +void CLTopKV2Store::configure(ICLTensor *values, ICLTensor *indices, int k, int n) +{ + ARM_COMPUTE_ERROR_ON(values == nullptr && indices == nullptr); + ARM_COMPUTE_ERROR_ON(k == 0); + ARM_COMPUTE_ERROR_ON(k > n); + + _values = values; + _indices = indices; + + // Set kernel build options + std::set<std::string> build_opts; + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("topkv2_store", build_opts)); + + unsigned int idx = 2 * num_arguments_per_1D_tensor() + 2; + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, k, 1)); + ICLKernel::configure_internal(win); +} + +void CLTopKV2Store::setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf) +{ + _out_key_buf = out_key_buf; + _out_ind_buf = out_ind_buf; +} + +void CLTopKV2Store::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int idx = 0; + add_1D_tensor_argument(idx, _values, window); + add_1D_tensor_argument(idx, _indices, window); + _kernel.setArg(idx++, *_out_key_buf); + _kernel.setArg(idx++, *_out_ind_buf); + + enqueue(queue, *this, window); +} + +} // namespace arm_compute diff --git a/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp new file mode 100644 index 000000000..6cc8d9d13 --- /dev/null +++ b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/CLValidate.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +CLTransposeConvLayerUpsampleKernel::CLTransposeConvLayerUpsampleKernel() + : _input(nullptr), _output(nullptr), _inner_border(), _info() +{ +} + +Status CLTransposeConvLayerUpsampleKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const BorderSize &inner_border, + const PadStrideInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); + + const DataLayout data_layout = input->data_layout(); + + const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + + ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_w) == 0); + ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_h) == 0); + + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_c) != output->dimension(idx_c)); + for (size_t i = 3; i < Coordinates::num_max_dimensions; ++i) + { + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i)); + } + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border.right > info.stride().first - 1, + "inner_border_right must be smaller that stride_x"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border.top > info.stride().second - 1, + "inner_border_top must be smaller that stride_y"); + + return Status{}; +} + +void CLTransposeConvLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output, + const BorderSize &inner_border, + const PadStrideInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + _input = input; + _output = output; + _inner_border = inner_border; + _info = info; + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(CLTransposeConvLayerUpsampleKernel::validate( + input->info(), output->info(), inner_border, info)); + + // Create kernel + CLBuildOptions build_opts; + build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + _kernel = static_cast<cl::Kernel>( + CLKernelLibrary::get().create_kernel("deconvolution_upsample", build_opts.options())); + + constexpr unsigned int num_elems_processed_per_iteration = 1; + + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); + + ICLKernel::configure_internal(win); +} + +void CLTransposeConvLayerUpsampleKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const DataLayout data_layout = _input->info()->data_layout(); + + const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + const int out_start_x = _info.pad_left(); + const int out_end_x = _output->info()->dimension(idx_w) - _inner_border.right - + _info.pad_right() + _info.stride().first - 1; + const int out_step_x = _info.stride().first; + + const int out_start_y = _inner_border.top + _info.pad_top(); + const int out_end_y = + _output->info()->dimension(idx_h) - _info.pad_bottom() + _info.stride().second - 1; + const int out_step_y = _info.stride().second; + + switch (data_layout) + { + case DataLayout::NCHW: + { + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + + Window slice_out = collapsed.first_slice_window_3D(); + slice_out.set(Window::DimX, Window::Dimension(out_start_x, out_end_x, out_step_x)); + slice_out.set(Window::DimY, Window::Dimension(out_start_y, out_end_y, out_step_y)); + + Window slice_in = collapsed.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice_in); + add_3D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_out); + } while (collapsed.slide_window_slice_3D(slice_in) && + collapsed.slide_window_slice_3D(slice_out)); + break; + } + case DataLayout::NHWC: + { + // NOTE: not collapsing in NHWC + Window slice_out = window.first_slice_window_3D(); + slice_out.set(Window::DimY, Window::Dimension(out_start_x, out_end_x, out_step_x)); + slice_out.set(Window::DimZ, Window::Dimension(out_start_y, out_end_y, out_step_y)); + + Window slice_in = window.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice_in); + add_3D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_out); + } while (window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out)); + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported data layout"); + } +} |