diff options
Diffstat (limited to 'libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp')
-rw-r--r-- | libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp | 253 |
1 files changed, 253 insertions, 0 deletions
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp new file mode 100644 index 000000000..48146a43a --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLStridedSliceExKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/TensorInfo.h" + +using namespace arm_compute; + +CLStridedSliceExKernel::CLStridedSliceExKernel() + : _input(nullptr), _output(nullptr), _beginData(nullptr), _endData(nullptr), + _stridesData(nullptr), _beginMask(0), _endMask(0), _shrinkAxisMask(0) +{ +} + +Status CLStridedSliceExKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *begin, const ITensorInfo *end, + const ITensorInfo *strides, int32_t beginMask, + int32_t endMask, int32_t shrinkAxisMask) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, begin, end, strides); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(begin, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(end, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(strides, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + ARM_COMPUTE_ERROR_ON(begin->num_dimensions() != 1 || begin->dimension(0) > 4); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(begin->tensor_shape(), end->tensor_shape(), + strides->tensor_shape()); + + return Status{}; +} + +// Return the index for the first element along that axis. This index will be a +// positive integer between [0, axisSize - 1] that can be used to index +// directly into the data. +inline int32_t StartForAxis(int32_t beginMask, int32_t begin, int32_t stride, + const TensorShape &inputShape, int32_t axis) +{ + // Begin with the specified index + int32_t start = begin; + + // beginMask override + if (beginMask & 1 << axis) + { + if (stride > 0) + { + // Forward iteration - use the first element. These values will get + // clamped below (Note: We could have set them to 0 and axisSize-1, but + // use lowest() and max() to maintain symmetry with StopForAxis()) + start = std::numeric_limits<int32_t>::lowest(); + } + else + { + // Backward iteration - use the last element. + start = std::numeric_limits<int32_t>::max(); + } + } + + // Handle negative indices + int32_t axisSize = inputShape[axis]; + if (start < 0) + { + start += axisSize; + } + + // Clamping + start = arm_compute::utility::clamp(start, 0, axisSize - 1); + + return start; +} + +// Return the "real" index for the end of iteration along that axis. This is an +// "end" in the traditional C sense, in that it points to one past the last +// element. ie. So if you were iterating through all elements of a 1D array of +// size 4, this function would return 4 as the stop, because it is one past the +// "real" indices of 0, 1, 2 & 3. +inline int32_t StopForAxis(int32_t endMask, int32_t end, int32_t stride, + const TensorShape &inputShape, int32_t axis) +{ + // Begin with the specified index + int32_t stop = end; + + // endMask override + if (endMask & (1 << axis)) + { + if (stride > 0) + { + // Forward iteration - use the last element. These values will get + // clamped below + stop = std::numeric_limits<int32_t>::max(); + } + else + { + // Backward iteration - use the first element. + stop = std::numeric_limits<int32_t>::lowest(); + } + } + + // Handle negative indices + int32_t axisSize = inputShape[axis]; + if (stop < 0) + { + stop += axisSize; + } + + // Clamping + // Because the end index points one past the last element, we need slightly + // different clamping ranges depending on the direction. + if (stride > 0) + { + // Forward iteration + stop = arm_compute::utility::clamp(stop, 0, axisSize); + } + else + { + // Backward iteration + stop = arm_compute::utility::clamp(stop, -1, axisSize - 1); + } + + return stop; +} + +inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride) +{ + int32_t ret = 0; + if (stride > 0) + { + ret = ((stop - start - 1) / stride) + 1; + } + else + { + ret = ((stop - start + 1) / stride) + 1; + } + ARM_COMPUTE_ERROR_ON_MSG(ret < 0, "The dimension must be the natural number"); + return ret; +} + +void CLStridedSliceExKernel::configure(const ICLTensor *input, ICLTensor *output, + ICLTensor *beginData, ICLTensor *endData, + ICLTensor *stridesData, int32_t beginMask, int32_t endMask, + int32_t shrinkAxisMask) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), beginData->info(), + endData->info(), stridesData->info(), beginMask, endMask, + shrinkAxisMask)); + + _input = input; + _output = output; + _beginData = beginData; + _endData = endData; + _stridesData = stridesData; + _beginMask = beginMask; + _endMask = endMask; + _shrinkAxisMask = shrinkAxisMask; + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-DELEMENT_DATA_TYPE=" + + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("strided_slice_ex", build_opts)); + + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps()); + ICLKernel::configure_internal(win); +} + +void CLStridedSliceExKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + _beginData->map(queue); + _endData->map(queue); + _stridesData->map(queue); + + std::vector<int32_t> starts; + std::vector<int32_t> strides; + + for (uint32_t n = 0; n < _beginData->info()->tensor_shape().total_size(); ++n) + { + const TensorShape shape = _input->info()->tensor_shape(); + starts.emplace_back( + StartForAxis(_beginMask, reinterpret_cast<int32_t *>(_beginData->buffer())[n], + reinterpret_cast<int32_t *>(_stridesData->buffer())[n], shape, n)); + + strides.emplace_back(reinterpret_cast<int32_t *>(_stridesData->buffer())[n]); + } + + for (uint32_t n = _beginData->info()->tensor_shape().total_size(); n < 4; n++) + { + starts.emplace_back(0); + strides.emplace_back(1); + } + // TODO: Apply shrinkAxisMask + + _beginData->unmap(queue); + _stridesData->unmap(queue); + _endData->unmap(queue); + + unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters + const cl_int4 startsArg = {{ + static_cast<cl_int>(starts[0]), static_cast<cl_int>(starts[1]), + static_cast<cl_int>(starts[2]), static_cast<cl_int>(starts[3]), + }}; + _kernel.setArg<cl_int4>(idx++, startsArg); + + const cl_int4 stridesArg = {{ + static_cast<cl_int>(strides[0]), static_cast<cl_int>(strides[1]), + static_cast<cl_int>(strides[2]), static_cast<cl_int>(strides[3]), + }}; + _kernel.setArg<cl_int4>(idx++, stridesArg); + + Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + // Setup output slice + Window slice_in(slice_out); + slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_in.set(3, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_out); + } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); +} |