summaryrefslogtreecommitdiff
path: root/runtimes/libs/ARMComputeEx/src/core/CL/kernels
diff options
context:
space:
mode:
Diffstat (limited to 'runtimes/libs/ARMComputeEx/src/core/CL/kernels')
-rw-r--r--runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp157
-rw-r--r--runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp172
-rw-r--r--runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp102
-rw-r--r--runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp116
-rw-r--r--runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp114
-rw-r--r--runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp181
-rw-r--r--runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp178
-rw-r--r--runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp88
-rw-r--r--runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp186
-rw-r--r--runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp179
-rw-r--r--runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp241
-rw-r--r--runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp124
-rw-r--r--runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp468
-rw-r--r--runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp164
14 files changed, 2470 insertions, 0 deletions
diff --git a/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp
new file mode 100644
index 000000000..7f4b5b0df
--- /dev/null
+++ b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis)
+{
+ TensorShape out_shape{input_shape};
+
+ out_shape.set(axis, 1);
+
+ return out_shape;
+}
+} // namespace
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis,
+ ArgOperation /*op*/)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::S32, DataType::F32, DataType::U8,
+ DataType::QASYMM8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::S32);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape().num_dimensions() - 1) !=
+ output->tensor_shape().num_dimensions(),
+ "Input's rank is not same with output");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
+ "Inputs are not broadcast compatible");
+
+ const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(),
+ "output shape's size does not match axis");
+
+ const auto num_dimensions = input->tensor_shape().num_dimensions();
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= num_dimensions, "axis must be less than (input's rank).");
+ return Status{};
+}
+
+} // namespace
+
+CLArgOperationKernel::CLArgOperationKernel() : _input(nullptr), _output(nullptr), _axis() {}
+
+void CLArgOperationKernel::configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis,
+ ArgOperation op)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
+
+ _input = input;
+ _output = output;
+ _axis = axis;
+
+ std::unique_ptr<ITensorInfo> output_info = output->info()->clone();
+ output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis));
+
+ // Construct kernel and set op_code based on type of ArgOperation as specified by object op
+ std::string kernel_name = "arg_op";
+ int op_code = 0;
+ if (op == ArgOperation::MAX)
+ {
+ op_code = 1;
+ }
+ else if (op == ArgOperation::MIN)
+ {
+ op_code = 2;
+ }
+ else
+ throw std::runtime_error("Operation not supported, yet");
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2)));
+ build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code));
+
+ // Create kernel
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output_info, Steps());
+
+ Coordinates coord;
+ coord.set_num_dimensions(output_info->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape()));
+
+ ICLKernel::configure_internal(win);
+}
+
+Status CLArgOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const uint32_t axis, ArgOperation op)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
+
+ return Status{};
+}
+
+void CLArgOperationKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const TensorShape &shape_in = _input->info()->tensor_shape();
+
+ unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
+
+ _kernel.setArg<cl_int>(idx++, _axis);
+ _kernel.setArg<cl_int>(idx++, shape_in[_axis]);
+
+ Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+ // Setup input slice
+ Window slice_in(slice_out);
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ slice_in.set(3, Window::Dimension(0, 0, 0));
+
+ // Copy output's shape in order to use for recovering at end of this method
+ const TensorShape shape_out = _output->info()->tensor_shape();
+ _output->info()->set_tensor_shape(inferOutputShape(shape_in, _axis));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out);
+ } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
+
+ // Recover output's shape of output tensor
+ _output->info()->set_tensor_shape(shape_out);
+}
diff --git a/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
new file mode 100644
index 000000000..c14e73634
--- /dev/null
+++ b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+Status validate_parameters(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output)
+{
+ const TensorShape &out_shape =
+ TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
+ "Inputs are not broadcast compatible");
+ // Validate in case of configured output
+ if (output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8,
+ DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
+ "Wrong shape for output");
+ }
+ return Status{};
+}
+} // namespace
+
+CLBinaryLogicalOpKernel::CLBinaryLogicalOpKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void CLBinaryLogicalOpKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
+ ICLTensor *output, BinaryLogicalOperation op)
+{
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_parameters(input1->info(), input2->info(), output->info()));
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ // Create kernel
+ std::string kernel_name = "binary_logical_op";
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())));
+
+ int op_code = 0;
+ switch (op)
+ {
+ case BinaryLogicalOperation::AND:
+ op_code = 1;
+ break;
+ case BinaryLogicalOperation::OR:
+ op_code = 2;
+ break;
+ default:
+ throw std::runtime_error("Operation not supported, yet");
+ }
+
+ build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code)));
+ build_opts.emplace(
+ ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+ const std::pair<TensorShape, ValidRegion> broadcast_pair =
+ ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
+
+ const ValidRegion &valid_region = broadcast_pair.second;
+
+ Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+ Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info());
+ Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info());
+
+ AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win_input1, input1_access) ||
+ update_window_and_padding(win_input2, input2_access) ||
+ update_window_and_padding(win, output_access);
+
+ output_access.set_valid_region(win, valid_region);
+
+ ICLKernel::configure_internal(win);
+}
+
+void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const TensorShape &in_shape1 = _input1->info()->tensor_shape();
+ const TensorShape &in_shape2 = _input2->info()->tensor_shape();
+ const TensorShape &out_shape = _output->info()->tensor_shape();
+
+ bool can_collapse = true;
+ if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+ {
+ can_collapse =
+ (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+ for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+ {
+ can_collapse = (in_shape1[d] == in_shape2[d]);
+ }
+ }
+
+ bool has_collapsed = false;
+ Window collapsed =
+ can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
+ : window;
+
+ const TensorShape &in_shape1_collapsed =
+ has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+ const TensorShape &in_shape2_collapsed =
+ has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+ Window slice = collapsed.first_slice_window_3D();
+ Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+ Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input1, slice_input1);
+ add_3D_tensor_argument(idx, _input2, slice_input2);
+ add_3D_tensor_argument(idx, _output, slice);
+
+ enqueue(queue, *this, slice);
+
+ collapsed.slide_window_slice_3D(slice_input1);
+ collapsed.slide_window_slice_3D(slice_input2);
+ } while (collapsed.slide_window_slice_3D(slice));
+}
+
+BorderSize CLBinaryLogicalOpKernel::border_size() const
+{
+ const unsigned int replicateSize =
+ _output->info()->dimension(0) -
+ std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+ const unsigned int border =
+ std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
+ return BorderSize(0, border, 0, 0);
+}
diff --git a/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
new file mode 100644
index 000000000..ac2963f38
--- /dev/null
+++ b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLCastKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+CLCastKernel::CLCastKernel() : _input(nullptr), _output(nullptr) {}
+
+void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+ DataType::S16, DataType::S32, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+ DataType::S16, DataType::S32, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+ _input = input;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+ build_opts.emplace(
+ ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+
+ // Create kernel
+ if (is_data_type_quantized_asymmetric(input->info()->data_type()))
+ {
+ const float scale_in = input->info()->quantization_info().scale;
+ const int offset_in = input->info()->quantization_info().offset;
+ build_opts.emplace("-DSCALE=" + float_to_string_with_full_precision(scale_in));
+ build_opts.emplace("-DOFFSET=" + support::cpp11::to_string(offset_in));
+
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("cast_qasymm_in", build_opts));
+ }
+ else if (is_data_type_quantized_asymmetric(output->info()->data_type()))
+ {
+ const float scale_in = output->info()->quantization_info().scale;
+ const int offset_in = output->info()->quantization_info().offset;
+ build_opts.emplace("-DSCALE=" + float_to_string_with_full_precision(scale_in));
+ build_opts.emplace("-DOFFSET=" + support::cpp11::to_string(offset_in));
+
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("cast_qasymm_out", build_opts));
+ }
+ else
+ {
+ _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("cast", build_opts));
+ }
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure_internal(win);
+}
+
+void CLCastKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice, lws_hint());
+ } while (collapsed.slide_window_slice_3D(slice));
+}
diff --git a/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
new file mode 100644
index 000000000..2a3433c2b
--- /dev/null
+++ b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+// TODO Use this validation function
+#if 0
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+ const int32_t block_size)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+ DataType::S16, DataType::S32, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+ DataType::S16, DataType::S32, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1,
+ "Block size should be greater than or equal to 1.");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0) * block_size,
+ "Output width should be equal to (Input width * block size)");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(1) != input->dimension(1) * block_size,
+ "Output height should be equal to (Input height * block size)");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) % (block_size * block_size) != 0,
+ "Input depth should be divisible by (block size * block size)");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ output->dimension(2) != input->dimension(2) / (block_size * block_size),
+ "Output depth should be equal to (Input depth / (block size * block size))");
+
+ return Status{};
+}
+#endif
+} // namespace
+
+CLDepthToSpaceKernel::CLDepthToSpaceKernel() : _input(nullptr), _output(nullptr)
+{
+ // DO NOTHING
+}
+
+void CLDepthToSpaceKernel::configure(const ICLTensor *input, ICLTensor *output,
+ const int32_t block_size)
+{
+ // TODO Add validation of data_layout
+ _input = input;
+ _output = output;
+
+ // Set kernel build options
+ auto layout_out = output->info()->data_layout();
+ std::set<std::string> build_opts;
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
+ auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL);
+ auto depth = output->info()->dimension(index_depth);
+ build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(depth));
+ build_opts.emplace("-DZ_OUT=" + support::cpp11::to_string(output->info()->tensor_shape().z()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(
+ "depth_to_space_" + lower_string(string_from_data_layout(layout_out)), build_opts));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps());
+
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+ ICLKernel::configure_internal(win);
+}
+
+void CLDepthToSpaceKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+ // Setup input slice
+ Window slice_in(slice_out);
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ slice_in.set(3, Window::Dimension(0, 0, 0));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out);
+ } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
+}
diff --git a/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
new file mode 100644
index 000000000..0862b78bf
--- /dev/null
+++ b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+ input_access.set_valid_region(win, output->valid_region());
+
+ Status err = (window_changed)
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLEmbeddingLookupKernel::CLEmbeddingLookupKernel()
+ : _input(nullptr), _output(nullptr), _lookups(nullptr)
+{
+}
+
+Status CLEmbeddingLookupKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *lookups)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4);
+ ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1);
+
+ return Status{};
+}
+
+void CLEmbeddingLookupKernel::configure(const ICLTensor *input, ICLTensor *output,
+ const ICLTensor *lookups)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info()));
+
+ _input = input;
+ _output = output;
+ _lookups = lookups;
+
+ // Set kernel build options
+ std::stringstream kernel_name;
+ std::set<std::string> build_opts;
+ kernel_name << "embedding_lookup";
+
+ build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+}
+
+void CLEmbeddingLookupKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+ Window win_lookup;
+ win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _output, slice_in);
+ add_1D_tensor_argument(idx, _lookups, win_lookup);
+
+ enqueue(queue, *this, slice_in);
+ } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_1D(win_lookup));
+}
diff --git a/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
new file mode 100644
index 000000000..c83ece0e9
--- /dev/null
+++ b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLGatherExKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/UtilsEx.h"
+
+using namespace arm_compute;
+
+namespace
+{
+
+inline TensorShape compute_gather_shape(const TensorShape &input_shape,
+ const TensorShape &indices_shape, uint32_t actual_axis)
+{
+ ARM_COMPUTE_ERROR_ON(indices_shape.num_dimensions() > 3);
+ ARM_COMPUTE_ERROR_ON(input_shape.num_dimensions() > 4);
+ ARM_COMPUTE_ERROR_ON(input_shape.num_dimensions() + indices_shape.num_dimensions() - 1 > 4);
+ ARM_COMPUTE_ERROR_ON(actual_axis >= input_shape.num_dimensions());
+
+ TensorShape output_shape = input_shape;
+ if (indices_shape.num_dimensions() == 1)
+ {
+ output_shape[actual_axis] = indices_shape[0];
+ }
+ else if (indices_shape.num_dimensions() > 1)
+ {
+ output_shape.shift_right(indices_shape.num_dimensions() - 1);
+
+ for (uint32_t i = 0, o = 0; o < output_shape.num_dimensions(); ++o, ++i)
+ {
+ if (o == actual_axis)
+ {
+ ++i;
+ for (uint32_t in = 0; in < indices_shape.num_dimensions(); ++in, ++o)
+ {
+ output_shape[o] = indices_shape[in];
+ }
+ }
+ else
+ {
+ output_shape[o] = input_shape[i];
+ }
+ }
+ }
+ return output_shape;
+}
+
+/** Wrap-around a number within the range 0 <= x < m
+ *
+ * @param[in] x Input value
+ * @param[in] m Range
+ *
+ * @return the wrapped-around number
+ */
+template <typename T> inline T wrap_around(T x, T m) { return x >= 0 ? x % m : (x % m + m) % m; }
+
+inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *indices,
+ const ITensorInfo *output, int axis)
+{
+ const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions()));
+ ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 3);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+ ARM_COMPUTE_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= input->num_dimensions());
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+
+ if (output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+ TensorShape output_shape =
+ compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32);
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *indices,
+ ITensorInfo *output, int axis)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
+ const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions()));
+ std::unique_ptr<ITensorInfo> output_info = input->clone();
+ output_info->set_tensor_shape(
+ compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis));
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty((*output), output_info->tensor_shape(), 1, input->data_type());
+
+ // Create window
+ Window win = calculate_max_window(*output, Steps());
+ output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+
+ return std::make_pair(Status{}, win);
+}
+
+} // namespace
+
+CLGatherExKernel::CLGatherExKernel()
+ : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0)
+{
+}
+
+void CLGatherExKernel::configure(const ICLTensor *input, const ICLTensor *indices,
+ ICLTensor *output, int axis)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), indices->info(), output->info(), axis));
+
+ // Configure kernel window
+ auto win_config =
+ validate_and_configure_window(input->info(), indices->info(), output->info(), axis);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
+ _input = input;
+ _output = output;
+ _indices = indices;
+ _axis = wrap_around(axis, static_cast<int>(input->info()->num_dimensions()));
+
+ // Set build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DOUTPUT_DIM_Z=" +
+ support::cpp11::to_string(output->info()->dimension(2)));
+ build_opts.add_option("-DINPUT_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2)));
+ build_opts.add_option("-DAXIS=" + support::cpp11::to_string(_axis));
+ build_opts.add_option("-DINDICES_DIM=" +
+ support::cpp11::to_string(indices->info()->num_dimensions()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("gather_ex", build_opts.options()));
+ ICLKernel::configure_internal(win_config.second);
+}
+
+Status CLGatherExKernel::validate(const ITensorInfo *input, const ITensorInfo *indices,
+ const ITensorInfo *output, int axis)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, indices, output, axis));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
+ indices->clone().get(),
+ output->clone().get(), axis)
+ .first);
+ return Status{};
+}
+
+void CLGatherExKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ, 4);
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, window_collapsed);
+ add_3D_tensor_argument(idx, _indices, window_collapsed);
+ add_4D_tensor_argument(idx, _output, window_collapsed);
+ enqueue(queue, *this, window_collapsed, lws_hint());
+}
diff --git a/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
new file mode 100644
index 000000000..31e98c9a8
--- /dev/null
+++ b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+ input_access.set_valid_region(win, output->valid_region());
+
+ Status err = (window_changed)
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLHashtableLookupKernel::CLHashtableLookupKernel()
+{
+ // DO NOTHING
+}
+
+Status CLHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITensorInfo *keys,
+ const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *hits)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
+ "Output's shape was not set");
+
+ ARM_COMPUTE_ERROR_ON(lookups->dimension(0) != hits->dimension(0) ||
+ output->dimension(output->num_dimensions() - 1) != lookups->dimension(0));
+ ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4);
+ ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1);
+ ARM_COMPUTE_ERROR_ON(keys->num_dimensions() > 1);
+ ARM_COMPUTE_ERROR_ON(hits->num_dimensions() > 1);
+
+ return Status{};
+}
+
+void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTensor *keys,
+ const ICLTensor *input, ICLTensor *output, ICLTensor *hits)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info()));
+
+ _lookups = lookups;
+ _keys = keys;
+ _input = input;
+ _output = output;
+ _hits = hits;
+
+ // Make _lookup_indices tensor
+ _lookup_indices = arm_compute::support::cpp14::make_unique<CLTensor>();
+ _lookup_indices->allocator()->init(
+ TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32));
+ _lookup_indices->allocator()->allocate();
+
+ // Set kernel build options
+ std::stringstream kernel_name;
+ std::set<std::string> build_opts;
+ kernel_name << "hashtable_lookup";
+
+ build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+}
+
+void CLHashtableLookupKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ const_cast<ICLTensor *>(_lookups)->map(queue);
+ const_cast<ICLTensor *>(_keys)->map(queue);
+ _hits->map(queue);
+ _lookup_indices->map(queue);
+
+ // Set values of hits
+ const int32_t *lookups_buf =
+ reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_lookups)->buffer());
+ const int32_t *keys_buf = reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_keys)->buffer());
+ uint8_t *hits_buf = reinterpret_cast<uint8_t *>(_hits->buffer());
+ int32_t *lookup_indices_buf = reinterpret_cast<int32_t *>(_lookup_indices->buffer());
+
+ std::map<int32_t, size_t> key_map;
+ const size_t keys_num = _keys->info()->dimension(0);
+ for (size_t key_index = 0; key_index < keys_num; key_index++)
+ {
+ key_map[keys_buf[key_index]] = key_index;
+ }
+
+ const size_t lookups_num = _lookups->info()->dimension(0);
+ for (size_t i = 0; i < lookups_num; ++i)
+ {
+ const auto lookup_value = lookups_buf[i];
+ const auto it = key_map.find(lookup_value);
+ if (it != key_map.end())
+ {
+#if defined(ARM_COMPUTE_DEBUG_ENABLED)
+ if (it->second >= lookups_num)
+ ARM_COMPUTE_ERROR("HashTable Lookup: index out of bounds.");
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED)
+ lookup_indices_buf[i] = static_cast<int32_t>(it->second);
+ hits_buf[i] = static_cast<uint8_t>(1);
+ }
+ else
+ {
+ lookup_indices_buf[i] = -1;
+ hits_buf[i] = static_cast<uint8_t>(0);
+ }
+ }
+
+ const_cast<ICLTensor *>(_lookups)->unmap(queue);
+ const_cast<ICLTensor *>(_keys)->unmap(queue);
+ _hits->unmap(queue);
+ _lookup_indices->unmap(queue);
+
+ Window win = window.collapse(ICLKernel::window(), 2, 4);
+
+ Window win_lookup;
+ win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, win);
+ add_4D_tensor_argument(idx, _output, win);
+ add_1D_tensor_argument(idx, _lookup_indices.get(), win_lookup);
+
+ enqueue(queue, *this, win);
+ } while (window.slide_window_slice_4D(win) && window.slide_window_slice_1D(win_lookup));
+}
diff --git a/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
new file mode 100644
index 000000000..ecfe05a51
--- /dev/null
+++ b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLNegKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16, DataType::S32,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16, DataType::S32,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape());
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ return Status{};
+}
+
+} // namespace
+
+CLNegKernel::CLNegKernel() : _input(nullptr), _output(nullptr) {}
+
+void CLNegKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+
+ _input = input;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Create kernel
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.emplace(
+ ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("neg_tensor", build_opts));
+
+ // Configure window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure_internal(win);
+}
+
+void CLNegKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice, lws_hint());
+ } while (collapsed.slide_window_slice_3D(slice));
+}
diff --git a/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp
new file mode 100644
index 000000000..e7d587029
--- /dev/null
+++ b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLPReLUKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+Status validate_info(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output)
+{
+ const TensorShape &out_shape =
+ TensorShape::broadcast_shape(input->tensor_shape(), alpha->tensor_shape());
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32,
+ DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(alpha, 1, DataType::F16, DataType::F32,
+ DataType::QASYMM8);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
+ "Inputs are not broadcast compatible");
+ // Validate in case of configured output
+ if (output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32,
+ DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
+ "Wrong shape for output");
+ }
+ return Status{};
+}
+} // namespace
+
+CLPReLUKernel::CLPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {}
+
+void CLPReLUKernel::configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, alpha);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_info(input->info(), alpha->info(), output->info()));
+
+ _input = input;
+ _alpha = alpha;
+ _output = output;
+
+ // Create kernel
+ std::string kernel_name = "prelu";
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.emplace(
+ ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+
+ if (is_data_type_quantized_asymmetric(input->info()->data_type()))
+ {
+ build_opts.emplace("-DOFF_IN=" +
+ support::cpp11::to_string(input->info()->quantization_info().offset));
+ build_opts.emplace("-DOFF_ALPHA=" +
+ support::cpp11::to_string(alpha->info()->quantization_info().offset));
+ build_opts.emplace("-DOFF_OUT=" +
+ support::cpp11::to_string(output->info()->quantization_info().offset));
+ build_opts.emplace("-DSCALE_IN=" +
+ support::cpp11::to_string(input->info()->quantization_info().scale));
+ build_opts.emplace("-DSCALE_ALPHA=" +
+ support::cpp11::to_string(alpha->info()->quantization_info().scale));
+ build_opts.emplace("-DSCALE_OUT=" +
+ support::cpp11::to_string(output->info()->quantization_info().scale));
+ kernel_name += "_qasymm8";
+ }
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+ const std::pair<TensorShape, ValidRegion> broadcast_pair =
+ ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info());
+
+ const TensorShape &out_shape = broadcast_pair.first;
+ const ValidRegion &valid_region = broadcast_pair.second;
+
+ // Auto initialize output if not initialized
+ {
+ set_shape_if_empty(*output->info(), out_shape);
+
+ if (input->info()->data_type() == DataType::F16 && alpha->info()->data_type() == DataType::F16)
+ {
+ set_format_if_unknown(*output->info(), Format::F16);
+ }
+ else if (input->info()->data_type() == DataType::F32 ||
+ alpha->info()->data_type() == DataType::F32)
+ {
+ set_format_if_unknown(*output->info(), Format::F32);
+ }
+ }
+
+ Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+ Window win_input1 = win.broadcast_if_dimension_le_one(*input->info());
+ Window win_input2 = win.broadcast_if_dimension_le_one(*alpha->info());
+
+ AccessWindowHorizontal input1_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input2_access(alpha->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win_input1, input1_access) ||
+ update_window_and_padding(win_input2, input2_access) ||
+ update_window_and_padding(win, output_access);
+
+ output_access.set_valid_region(win, valid_region);
+
+ ICLKernel::configure_internal(win);
+}
+
+void CLPReLUKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const TensorShape &in_shape1 = _input->info()->tensor_shape();
+ const TensorShape &in_shape2 = _alpha->info()->tensor_shape();
+ const TensorShape &out_shape = _output->info()->tensor_shape();
+
+ bool can_collapse = true;
+ if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+ {
+ can_collapse =
+ (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+ for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+ {
+ can_collapse = (in_shape1[d] == in_shape2[d]);
+ }
+ }
+
+ bool has_collapsed = false;
+ Window collapsed =
+ can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
+ : window;
+
+ const TensorShape &in_shape1_collapsed =
+ has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+ const TensorShape &in_shape2_collapsed =
+ has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+ Window slice = collapsed.first_slice_window_3D();
+ Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+ Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice_input1);
+ add_3D_tensor_argument(idx, _alpha, slice_input2);
+ add_3D_tensor_argument(idx, _output, slice);
+
+ enqueue(queue, *this, slice);
+
+ collapsed.slide_window_slice_3D(slice_input1);
+ collapsed.slide_window_slice_3D(slice_input2);
+ } while (collapsed.slide_window_slice_3D(slice));
+}
+
+BorderSize CLPReLUKernel::border_size() const
+{
+ const unsigned int replicateSize =
+ _output->info()->dimension(0) -
+ std::min(_input->info()->dimension(0), _alpha->info()->dimension(0));
+ const unsigned int border =
+ std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
+ return BorderSize(0, border, 0, 0);
+}
diff --git a/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
new file mode 100644
index 000000000..24e89db28
--- /dev/null
+++ b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+namespace
+{
+// NOTE This is necessary because it is not guaranteed that the axis positions of input and output
+// are the same.
+const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis)
+{
+ TensorShape out_shape{input_shape};
+
+ out_shape.set(axis, 1);
+
+ return out_shape;
+}
+} // namespace
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis,
+ ReduceOperation op)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+ if (output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+ DataType::F32, DataType::S32);
+ if (op == ReduceOperation::SUM)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8,
+ "Not support QASYMM8, yet");
+ }
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
+ "Inputs are not broadcast compatible");
+
+ const auto num_dimensions = input->tensor_shape().num_dimensions();
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= num_dimensions, "axis must be less than (input's rank).");
+
+ const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(),
+ "output shape's size does not match axis");
+
+ return Status{};
+}
+} // namespace
+
+CLReduceOperationKernel::CLReduceOperationKernel() : _input(nullptr), _output(nullptr), _axis() {}
+
+void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *output,
+ const uint32_t axis, ReduceOperation op)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
+
+ _input = input;
+ _output = output;
+ _axis = axis;
+
+ std::unique_ptr<ITensorInfo> output_info = output->info()->clone();
+ output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis));
+
+ // Construct kernel name
+ std::string kernel_name;
+ int op_code = 0;
+ if (op == ReduceOperation::MAX)
+ {
+ kernel_name = "reduce_min_max";
+ op_code = 1;
+ }
+ else if (op == ReduceOperation::MIN)
+ {
+ kernel_name = "reduce_min_max";
+ op_code = 2;
+ }
+ else if (op == ReduceOperation::SUM)
+ {
+ kernel_name = "reduce_sum_mean";
+ op_code = 3;
+ }
+ else if (op == ReduceOperation::MEAN)
+ {
+ kernel_name = "reduce_sum_mean";
+ op_code = 4;
+ }
+ else
+ throw std::runtime_error("Operation not supported, yet");
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(output_info->data_type()));
+ build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2)));
+ build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code));
+
+ // Create kernel
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output_info, Steps());
+
+ Coordinates coord;
+ coord.set_num_dimensions(output_info->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape()));
+
+ ICLKernel::configure_internal(win);
+}
+
+Status CLReduceOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const uint32_t axis, ReduceOperation op)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
+
+ return Status{};
+}
+
+void CLReduceOperationKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const TensorShape &shape_in = _input->info()->tensor_shape();
+
+ unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
+
+ _kernel.setArg<cl_int>(idx++, _axis);
+ _kernel.setArg<cl_int>(idx++, shape_in[_axis]);
+
+ // Support dimensions up to 4
+ Window slice_out = window.collapse(ICLKernel::window(), 2, 4);
+
+ // Setup input slice
+ Window slice_in(slice_out);
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ slice_in.set(3, Window::Dimension(0, 0, 0));
+
+ // Copy output's shape in order to use for recovering at end of this method
+ // TODO Remove changing and recovering output's shape if it is guaranteed that the axis positions
+ // of input and output are the same
+ const TensorShape shape_out = _output->info()->tensor_shape();
+ _output->info()->set_tensor_shape(inferOutputShape(shape_in, _axis));
+
+ idx = 0;
+ add_4D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out, lws_hint());
+
+ // Recover output's shape of output tensor
+ _output->info()->set_tensor_shape(shape_out);
+}
diff --git a/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp
new file mode 100644
index 000000000..f7836b6cd
--- /dev/null
+++ b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_size,
+ const ITensorInfo *padding_size, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+ DataType::S16, DataType::F16, DataType::S32,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_size, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(padding_size, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+ DataType::S16, DataType::F16, DataType::S32,
+ DataType::F32);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != output->num_dimensions(),
+ "The number of dimensions of input should be equal to output");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() != output->data_layout(),
+ "The input and output layouts are different!");
+
+ // TODO Support other cases
+ if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NCHW)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != output->dimension(2),
+ "Input Depth should be equal to Output Depth");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 ||
+ padding_size->dimension(1) != 2,
+ "Only 2-dimensional spatial block's size was wrong");
+ }
+ else if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NHWC)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(0) != output->dimension(0),
+ "Input Depth should be equal to Output Depth");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 ||
+ padding_size->dimension(1) != 2,
+ "Only 2-dimensional spatial block's size was wrong");
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("CLSpaceToBatchNDKernel supports only 4-dimensional input");
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() < 2 && input->num_dimensions() > 4,
+ "CLSpaceToBatchNDKernel supports dimensions up to 4");
+
+ if (input->data_type() == DataType::QASYMM8)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->quantization_info() != output->quantization_info(),
+ "The input and output quantization info are different!");
+ }
+
+ return Status{};
+}
+
+} // namespace
+
+CLSpaceToBatchNDKernel::CLSpaceToBatchNDKernel()
+{
+ // DO NOTHING
+}
+
+void CLSpaceToBatchNDKernel::configure(const ICLTensor *input, const ICLTensor *block_size,
+ const ICLTensor *padding_size, ICLTensor *output)
+{
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), block_size->info(), padding_size->info(), output->info()));
+
+ _input = input;
+ _block_size = block_size;
+ _padding_size = padding_size;
+ _output = output;
+
+ // Set kernel build options
+ // TODO Support other cases
+ std::string kernel_name = "space_to_batch_4d";
+ std::set<std::string> build_opts;
+ Window win;
+
+ if (input->info()->data_layout() == DataLayout::NCHW)
+ {
+ kernel_name += "_nchw";
+ build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+ build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(1)));
+ build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(0)));
+
+ win = calculate_max_window(*output->info(), Steps());
+
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+ }
+ else if (input->info()->data_layout() == DataLayout::NHWC)
+ {
+ kernel_name += "_nhwc";
+ build_opts.emplace("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+ build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
+ build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(1)));
+ build_opts.emplace("-DVEC_SIZE=" +
+ support::cpp11::to_string(num_elems_processed_per_iteration));
+
+ win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+ input_access.set_valid_region(win, output->info()->valid_region());
+
+ if (window_changed)
+ {
+ ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!");
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported layout");
+ }
+
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(3)));
+ if (input->info()->data_type() == DataType::QASYMM8)
+ {
+ build_opts.emplace("-DZERO_VALUE=" +
+ support::cpp11::to_string(input->info()->quantization_info().offset));
+ }
+ else
+ {
+ build_opts.emplace("-DZERO_VALUE=" + support::cpp11::to_string(0));
+ }
+
+ // Create kernel
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+ // Configure kernel window
+ ICLKernel::configure_internal(win);
+}
+
+void CLSpaceToBatchNDKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED)
+ const_cast<ICLTensor *>(_block_size)->map(queue);
+ const_cast<ICLTensor *>(_padding_size)->map(queue);
+
+ const size_t num_dimensions = _input->info()->num_dimensions();
+ const size_t num_spacial_dimensions = _block_size->info()->dimension(0);
+ uint32_t batch_size = _input->info()->dimension(num_dimensions - 1);
+ for (size_t i = 0; i < num_spacial_dimensions; ++i)
+ {
+ const int32_t block_size = *reinterpret_cast<int32_t *>(_block_size->ptr_to_element({i}));
+ const int32_t padding_size_pre =
+ *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({0, i}));
+ const int32_t padding_size_post =
+ *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({1, i}));
+
+ ARM_COMPUTE_ERROR_ON_MSG(block_size < 1, "Block size should be greater than or equal to 1");
+ ARM_COMPUTE_ERROR_ON_MSG(padding_size_pre < 0 && padding_size_post < 0,
+ "Padding size should be greater than or equal to 0");
+
+ if (num_dimensions == 4 && _input->info()->data_layout() == DataLayout::NCHW)
+ {
+ ARM_COMPUTE_ERROR_ON_MSG(
+ _output->info()->dimension(i) !=
+ (_input->info()->dimension(i) + padding_size_pre + padding_size_post) / block_size,
+ "Dimension value of spatial block does not match output's dimension value");
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR_ON_MSG(
+ _output->info()->dimension(num_dimensions - num_spacial_dimensions - 1 + i) !=
+ (_input->info()->dimension(num_dimensions - num_spacial_dimensions - 1 + i) +
+ padding_size_pre + padding_size_post) /
+ block_size,
+ "Dimension value of spatial block does not match output's dimension value");
+ }
+
+ batch_size *= block_size;
+ }
+ ARM_COMPUTE_ERROR_ON_MSG(
+ _output->info()->dimension(num_dimensions - 1) != batch_size,
+ "Output batch size should be equal to input batch size * (multiplication of all block size)");
+
+ const_cast<ICLTensor *>(_block_size)->unmap(queue);
+ const_cast<ICLTensor *>(_padding_size)->unmap(queue);
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED)
+
+ Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+ // Setup output slice
+ Window slice_in(slice_out);
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ slice_in.set(3, Window::Dimension(0, 0, 0));
+
+ // Set block size window
+ Window win_block = calculate_max_window(*_block_size->info(), Steps());
+
+ // Set padding size window
+ Window win_padding = calculate_max_window(*_padding_size->info(), Steps());
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _output, slice_out);
+ add_1D_tensor_argument(idx, _block_size, win_block);
+ add_2D_tensor_argument(idx, _padding_size, win_padding);
+ enqueue(queue, *this, slice_out);
+ } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in));
+}
diff --git a/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp
new file mode 100644
index 000000000..b085192a2
--- /dev/null
+++ b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+ const int32_t block_size)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+ DataType::S16, DataType::S32, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+ DataType::S16, DataType::S32, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1,
+ "Block size should be greater than or equal to 1.");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(3) != output->dimension(3),
+ "Input batch should be equal to Output batch");
+
+ auto layout_out = input->data_layout();
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+
+ auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL);
+ auto index_height = get_data_layout_dimension_index(layout_out, DataLayoutDimension::HEIGHT);
+ auto index_width = get_data_layout_dimension_index(layout_out, DataLayoutDimension::WIDTH);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ input->dimension(index_depth) * block_size * block_size != output->dimension(index_depth),
+ "Output depth should be equal to (input depth * block size *block size)");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->dimension(index_width) % block_size) ||
+ (input->dimension(index_height) % block_size),
+ "Input height and width should be divisible by block size");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ (output->dimension(index_width) != (input->dimension(index_width) / block_size)) ||
+ (output->dimension(index_height) != (input->dimension(index_height) / block_size)),
+ "Output height and width should be equal to "
+ "input_height/blocksize and input_width/blocksize respectively");
+
+ return Status{};
+}
+
+} // namespace
+
+CLSpaceToDepthKernel::CLSpaceToDepthKernel() : _input(nullptr), _output(nullptr) {}
+
+void CLSpaceToDepthKernel::configure(const ICLTensor *input, ICLTensor *output,
+ const int32_t block_size)
+{
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size));
+
+ _input = input;
+ _output = output;
+
+ // Set kernel build options
+ auto layout_out = input->info()->data_layout();
+ std::set<std::string> build_opts;
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
+ auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL);
+ auto depth = input->info()->dimension(index_depth);
+ build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(depth));
+ build_opts.emplace("-DZ_IN=" + support::cpp11::to_string(input->info()->tensor_shape().z()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(
+ "space_to_depth_" + lower_string(string_from_data_layout(layout_out)), build_opts));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps());
+
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+ ICLKernel::configure_internal(win);
+}
+
+void CLSpaceToDepthKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+ // Setup output slice
+ Window slice_out(slice_in);
+ slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ slice_out.set(3, Window::Dimension(0, 0, 0));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_in);
+ } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
+}
diff --git a/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp
new file mode 100644
index 000000000..073c2f7bb
--- /dev/null
+++ b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp
@@ -0,0 +1,468 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+namespace arm_compute
+{
+////////////////////////////////////////////////////////////////////////////////
+CLTopKV2Single::CLTopKV2Single() : _input(nullptr), _topk_values(nullptr), _topk_indices(nullptr) {}
+
+void CLTopKV2Single::configure(ICLTensor *input, ICLTensor *topk_values, ICLTensor *topk_indices,
+ cl::Buffer *indices, cl::Buffer *temp_stack, int k, int n)
+{
+ ARM_COMPUTE_ERROR_ON(input == nullptr && indices == nullptr);
+ ARM_COMPUTE_ERROR_ON(topk_values == nullptr && topk_indices == nullptr);
+ ARM_COMPUTE_ERROR_ON(n == 0);
+
+ _input = input;
+ _topk_values = topk_values;
+ _topk_indices = topk_indices;
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("topkv2_quicksort", build_opts));
+
+ unsigned int idx = 3 * num_arguments_per_1D_tensor();
+ _kernel.setArg(idx++, *indices);
+ _kernel.setArg(idx++, *temp_stack);
+ _kernel.setArg<cl_int>(idx++, k);
+ _kernel.setArg<cl_int>(idx++, n);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, 1, 1));
+ ICLKernel::configure_internal(win);
+}
+
+void CLTopKV2Single::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ unsigned int idx = 0;
+ add_1D_tensor_argument(idx, _input, window);
+ add_1D_tensor_argument(idx, _topk_values, window);
+ add_1D_tensor_argument(idx, _topk_indices, window);
+
+ enqueue(queue, *this, window);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLTopKV2Init::CLTopKV2Init() : _input(nullptr) {}
+
+void CLTopKV2Init::configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffer *in_ind_buf,
+ int n)
+{
+ ARM_COMPUTE_ERROR_ON(input == nullptr && in_key_buf == nullptr);
+ ARM_COMPUTE_ERROR_ON(in_ind_buf == nullptr);
+ ARM_COMPUTE_ERROR_ON(n == 0);
+
+ _input = input;
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+
+ // Create kernel
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("topkv2_init", build_opts));
+
+ unsigned int idx = num_arguments_per_1D_tensor();
+ _kernel.setArg(idx++, *in_key_buf);
+ _kernel.setArg(idx++, *in_ind_buf);
+ _kernel.setArg<cl_int>(idx++, n);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, n, 1));
+ ICLKernel::configure_internal(win);
+}
+
+void CLTopKV2Init::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ unsigned int idx = 0;
+ add_1D_tensor_argument(idx, _input, window);
+
+ enqueue(queue, *this, window);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// This kernel makes a histogram of radix for each work item.
+CLRadixSortHistogram::CLRadixSortHistogram() : _pass(0), _in_key_buf(nullptr) {}
+
+void CLRadixSortHistogram::configure(cl::Buffer *hist_buf, int bits, int n)
+{
+ ARM_COMPUTE_ERROR_ON(hist_buf == nullptr);
+
+ unsigned int radix = 1 << bits;
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
+ build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
+ build_opts.emplace("-DPERMUT=1");
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("radixsort_histogram", build_opts));
+
+ int loc_histo_size = radix * _ITEMS * sizeof(cl_int);
+
+ unsigned int idx = 1;
+ _kernel.setArg(idx++, *hist_buf);
+
+ idx = 3;
+ _kernel.setArg(idx++, loc_histo_size, nullptr);
+ _kernel.setArg<cl_int>(idx++, n);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1));
+ ICLKernel::configure_internal(win);
+}
+
+void CLRadixSortHistogram::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ _kernel.setArg(0, *_in_key_buf);
+ _kernel.setArg<cl_int>(2, _pass);
+
+ cl::NDRange lws = cl::NDRange(_ITEMS, 1);
+
+ enqueue(queue, *this, window, lws);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLRadixSortScanHistogram::CLRadixSortScanHistogram() {}
+
+void CLRadixSortScanHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits)
+{
+ ARM_COMPUTE_ERROR_ON(hist_buf == nullptr && glob_sum_buf == nullptr);
+
+ unsigned int radix = 1 << bits;
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
+ build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
+ build_opts.emplace("-DPERMUT=1");
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("radixsort_scanhistograms", build_opts));
+
+ int temp_size =
+ std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint);
+
+ unsigned int idx = 0;
+ _kernel.setArg(idx++, *hist_buf);
+ _kernel.setArg(idx++, temp_size, nullptr);
+ _kernel.setArg(idx++, *glob_sum_buf);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1));
+ ICLKernel::configure_internal(win);
+}
+
+void CLRadixSortScanHistogram::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
+ cl::NDRange lws = cl::NDRange(gws_x / _HISTOSPLIT, 1);
+
+ enqueue(queue, *this, window, lws);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLRadixSortGlobalScanHistogram::CLRadixSortGlobalScanHistogram() {}
+
+void CLRadixSortGlobalScanHistogram::configure(cl::Buffer *glob_sum_buf, cl::Buffer *temp_buf,
+ int bits)
+{
+ ARM_COMPUTE_ERROR_ON(glob_sum_buf == nullptr && temp_buf == nullptr);
+
+ unsigned int radix = 1 << bits;
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
+ build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
+ build_opts.emplace("-DPERMUT=1");
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("radixsort_scanhistograms", build_opts));
+
+ int temp_size =
+ std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint);
+
+ unsigned int idx = 0;
+ _kernel.setArg(idx++, *glob_sum_buf);
+ _kernel.setArg(idx++, temp_size, nullptr);
+ _kernel.setArg(idx++, *temp_buf);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, _HISTOSPLIT / 2, 1));
+ ICLKernel::configure_internal(win);
+}
+
+void CLRadixSortGlobalScanHistogram::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
+ cl::NDRange lws = cl::NDRange(gws_x, 1);
+
+ enqueue(queue, *this, window, lws);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLRadixSortPasteHistogram::CLRadixSortPasteHistogram() {}
+
+void CLRadixSortPasteHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits)
+{
+ ARM_COMPUTE_ERROR_ON(hist_buf == nullptr && glob_sum_buf == nullptr);
+
+ unsigned int radix = 1 << bits;
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
+ build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
+ build_opts.emplace("-DPERMUT=1");
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("radixsort_pastehistograms", build_opts));
+
+ unsigned int idx = 0;
+ _kernel.setArg(idx++, *hist_buf);
+ _kernel.setArg(idx++, *glob_sum_buf);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1));
+ ICLKernel::configure_internal(win);
+}
+
+void CLRadixSortPasteHistogram::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
+ cl::NDRange lws = cl::NDRange(gws_x / _HISTOSPLIT, 1);
+
+ enqueue(queue, *this, window, lws);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLRadixSortReorder::CLRadixSortReorder()
+ : _pass(0), _in_key_buf(nullptr), _out_key_buf(nullptr), _in_ind_buf(nullptr),
+ _out_ind_buf(nullptr)
+{
+}
+
+void CLRadixSortReorder::configure(cl::Buffer *hist_buf, int bits, int n)
+{
+ ARM_COMPUTE_ERROR_ON(hist_buf == nullptr);
+ ARM_COMPUTE_ERROR_ON(n == 0);
+
+ unsigned int radix = 1 << bits;
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
+ build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
+ build_opts.emplace("-DPERMUT=1");
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("radixsort_reorder", build_opts));
+
+ unsigned int idx = 2;
+ _kernel.setArg(idx++, *hist_buf);
+
+ idx = 6;
+ _kernel.setArg(idx++, sizeof(uint) * radix * _ITEMS, nullptr);
+ _kernel.setArg<cl_int>(idx++, n);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1));
+ ICLKernel::configure_internal(win);
+}
+
+void CLRadixSortReorder::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
+ unsigned int lx = std::max(1U, (gws_x / _HISTOSPLIT));
+ cl::NDRange lws = (lx < gws_x) ? cl::NDRange(lx, 1) : cl::NDRange(1, 1);
+
+ _kernel.setArg(0, *_in_key_buf);
+ _kernel.setArg(1, *_out_key_buf);
+ _kernel.setArg<cl_int>(3, _pass);
+ _kernel.setArg(4, *_in_ind_buf);
+ _kernel.setArg(5, *_out_ind_buf);
+
+ enqueue(queue, *this, window, lws);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLTopKV2FindFirstNegative::CLTopKV2FindFirstNegative() : _out_key_buf(nullptr) {}
+
+void CLTopKV2FindFirstNegative::configure(cl::Buffer *first_negative_idx_buf, int n)
+{
+ ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr);
+ ARM_COMPUTE_ERROR_ON(n == 0);
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("topkv2_find_first_negative", build_opts));
+
+ unsigned int idx = 1;
+ _kernel.setArg(idx++, *first_negative_idx_buf);
+ _kernel.setArg<cl_int>(idx++, n);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, n, 1));
+ ICLKernel::configure_internal(win);
+}
+
+void CLTopKV2FindFirstNegative::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ unsigned int idx = 0;
+ _kernel.setArg(idx++, *_out_key_buf);
+
+ enqueue(queue, *this, window);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLTopKV2ReorderNegatives::CLTopKV2ReorderNegatives()
+ : _in_key_buf(nullptr), _out_key_buf(nullptr), _in_ind_buf(nullptr), _out_ind_buf(nullptr)
+{
+}
+
+void CLTopKV2ReorderNegatives::configure(cl::Buffer *first_negative_idx_buf, int n)
+{
+ ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr);
+ ARM_COMPUTE_ERROR_ON(n == 0);
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("topkv2_reorder_negatives", build_opts));
+
+ unsigned int idx = 4;
+ _kernel.setArg(idx++, *first_negative_idx_buf);
+ _kernel.setArg<cl_int>(idx++, n);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, n, 1));
+ ICLKernel::configure_internal(win);
+}
+
+void CLTopKV2ReorderNegatives::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ unsigned int idx = 0;
+ _kernel.setArg(idx++, *_in_key_buf);
+ _kernel.setArg(idx++, *_out_key_buf);
+ _kernel.setArg(idx++, *_in_ind_buf);
+ _kernel.setArg(idx++, *_out_ind_buf);
+
+ enqueue(queue, *this, window);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLTopKV2Store::CLTopKV2Store()
+ : _values(nullptr), _indices(nullptr), _out_key_buf(nullptr), _out_ind_buf(nullptr)
+{
+}
+
+void CLTopKV2Store::configure(ICLTensor *values, ICLTensor *indices, int k, int n)
+{
+ ARM_COMPUTE_ERROR_ON(values == nullptr && indices == nullptr);
+ ARM_COMPUTE_ERROR_ON(k == 0);
+ ARM_COMPUTE_ERROR_ON(k > n);
+
+ _values = values;
+ _indices = indices;
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+
+ // Create kernel
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("topkv2_store", build_opts));
+
+ unsigned int idx = 2 * num_arguments_per_1D_tensor() + 2;
+ _kernel.setArg<cl_int>(idx++, n);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, k, 1));
+ ICLKernel::configure_internal(win);
+}
+
+void CLTopKV2Store::setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf)
+{
+ _out_key_buf = out_key_buf;
+ _out_ind_buf = out_ind_buf;
+}
+
+void CLTopKV2Store::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ unsigned int idx = 0;
+ add_1D_tensor_argument(idx, _values, window);
+ add_1D_tensor_argument(idx, _indices, window);
+ _kernel.setArg(idx++, *_out_key_buf);
+ _kernel.setArg(idx++, *_out_ind_buf);
+
+ enqueue(queue, *this, window);
+}
+
+} // namespace arm_compute
diff --git a/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp
new file mode 100644
index 000000000..6cc8d9d13
--- /dev/null
+++ b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLTransposeConvLayerUpsampleKernel::CLTransposeConvLayerUpsampleKernel()
+ : _input(nullptr), _output(nullptr), _inner_border(), _info()
+{
+}
+
+Status CLTransposeConvLayerUpsampleKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const BorderSize &inner_border,
+ const PadStrideInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+
+ const DataLayout data_layout = input->data_layout();
+
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_w) == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_h) == 0);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_c) != output->dimension(idx_c));
+ for (size_t i = 3; i < Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i));
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border.right > info.stride().first - 1,
+ "inner_border_right must be smaller that stride_x");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border.top > info.stride().second - 1,
+ "inner_border_top must be smaller that stride_y");
+
+ return Status{};
+}
+
+void CLTransposeConvLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output,
+ const BorderSize &inner_border,
+ const PadStrideInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ _input = input;
+ _output = output;
+ _inner_border = inner_border;
+ _info = info;
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(CLTransposeConvLayerUpsampleKernel::validate(
+ input->info(), output->info(), inner_border, info));
+
+ // Create kernel
+ CLBuildOptions build_opts;
+ build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibrary::get().create_kernel("deconvolution_upsample", build_opts.options()));
+
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ ICLKernel::configure_internal(win);
+}
+
+void CLTransposeConvLayerUpsampleKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const DataLayout data_layout = _input->info()->data_layout();
+
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ const int out_start_x = _info.pad_left();
+ const int out_end_x = _output->info()->dimension(idx_w) - _inner_border.right -
+ _info.pad_right() + _info.stride().first - 1;
+ const int out_step_x = _info.stride().first;
+
+ const int out_start_y = _inner_border.top + _info.pad_top();
+ const int out_end_y =
+ _output->info()->dimension(idx_h) - _info.pad_bottom() + _info.stride().second - 1;
+ const int out_step_y = _info.stride().second;
+
+ switch (data_layout)
+ {
+ case DataLayout::NCHW:
+ {
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+
+ Window slice_out = collapsed.first_slice_window_3D();
+ slice_out.set(Window::DimX, Window::Dimension(out_start_x, out_end_x, out_step_x));
+ slice_out.set(Window::DimY, Window::Dimension(out_start_y, out_end_y, out_step_y));
+
+ Window slice_in = collapsed.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice_in);
+ add_3D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out);
+ } while (collapsed.slide_window_slice_3D(slice_in) &&
+ collapsed.slide_window_slice_3D(slice_out));
+ break;
+ }
+ case DataLayout::NHWC:
+ {
+ // NOTE: not collapsing in NHWC
+ Window slice_out = window.first_slice_window_3D();
+ slice_out.set(Window::DimY, Window::Dimension(out_start_x, out_end_x, out_step_x));
+ slice_out.set(Window::DimZ, Window::Dimension(out_start_y, out_end_y, out_step_y));
+
+ Window slice_in = window.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice_in);
+ add_3D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out);
+ } while (window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out));
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data layout");
+ }
+}