summaryrefslogtreecommitdiff
path: root/libs/ARMComputeEx/src/core/CL/kernels
diff options
context:
space:
mode:
Diffstat (limited to 'libs/ARMComputeEx/src/core/CL/kernels')
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp211
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp159
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp216
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp117
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp173
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp17
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp212
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp109
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp114
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp77
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp21
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp177
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp89
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp166
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp185
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp149
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp126
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp54
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp129
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp181
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLReductionMeanKernel.cpp198
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp238
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp113
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp170
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp (renamed from libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceKernel.cpp)121
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp27
26 files changed, 3042 insertions, 507 deletions
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp
new file mode 100644
index 000000000..1fdd2f98f
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLActivationLayerExKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/UtilsEx.h"
+
+using namespace arm_compute;
+
+namespace
+{
+/** Check that the tensor infos describe a configuration the activation kernel accepts.
+ *
+ * @param[in] input    Source tensor info; must be single-channel U8, QASYMM8, F16 or F32.
+ * @param[in] output   Destination tensor info. May be nullptr or still empty, in which case
+ *                     the shape/data-type comparison against @p input is skipped.
+ * @param[in] act_info Activation description. Not inspected by any check here.
+ *
+ * @return A default-constructed Status when no check fails.
+ */
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          const ActivationLayerInfoEx &act_info)
+{
+  // No activation-specific constraint is checked; avoid an unused-parameter warning.
+  ARM_COMPUTE_UNUSED(act_info);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+                                                       DataType::F16, DataType::F32);
+
+  // Checks performed when output is configured
+  if ((output != nullptr) && (output->total_size() != 0))
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+  }
+
+  return Status{};
+}
+
+/** Compute the execution window and register the horizontal access patterns.
+ *
+ * @param[in,out] input  Source tensor info.
+ * @param[in,out] output Destination tensor info, or nullptr when running in place. When
+ *                       present it is auto-initialised from @p input if still empty.
+ *
+ * @return The status ("Insufficient Padding!" when update_window_and_padding reports a
+ *         change) paired with the computed window.
+ */
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+  const bool has_output = (output != nullptr);
+
+  if (has_output)
+  {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    // Give an empty output the same description as the input
+    auto_init_if_empty(*output, *input);
+  }
+
+  // Each work-item handles 16 bytes worth of elements
+  const unsigned int vec_size = 16 / input->element_size();
+
+  Window win = calculate_max_window(*input, Steps(vec_size));
+  bool padding_changed = false;
+
+  if (!has_output)
+  {
+    AccessWindowHorizontal input_access(input, 0, vec_size);
+    padding_changed = update_window_and_padding(win, input_access);
+  }
+  else
+  {
+    AccessWindowHorizontal input_access(input, 0, vec_size);
+    AccessWindowHorizontal output_access(output, 0, vec_size);
+    padding_changed = update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, input->valid_region());
+  }
+
+  if (padding_changed)
+  {
+    return std::make_pair(
+      ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!"), win);
+  }
+  return std::make_pair(Status{}, win);
+}
+} // namespace
+
+CLActivationLayerExKernel::CLActivationLayerExKernel()
+ : _input(nullptr), _output(nullptr), _run_in_place(false)
+{
+}
+
+/** Build the OpenCL kernel and execution window for the given tensors.
+ *
+ * @param[in,out] input    Source tensor; also the destination when running in place.
+ * @param[out]    output   Destination tensor. nullptr or @p input selects in-place execution.
+ *                         An empty output is auto-initialised from @p input.
+ * @param[in]     act_info Activation function and its a/b constants, forwarded to the kernel
+ *                         as build options.
+ */
+void CLActivationLayerExKernel::configure(ICLTensor *input, ICLTensor *output,
+                                          ActivationLayerInfoEx act_info)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+  _run_in_place = (output == nullptr) || (output == input);
+
+  if (output != nullptr)
+  {
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), *input->info()->clone());
+  }
+
+  ARM_COMPUTE_ERROR_THROW_ON(
+    validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, act_info));
+
+  const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+  const DataType dt = input->info()->data_type();
+  float a_const = act_info.a();
+  float b_const = act_info.b();
+  int a_const_int = 0;
+  int b_const_int = 0;
+
+  // Create quantized version of constants a, b if needed
+  if (is_data_type_quantized(dt))
+  {
+    a_const_int =
+      input->info()->quantization_info().quantize(a_const, RoundingPolicy::TO_NEAREST_UP);
+    b_const_int =
+      input->info()->quantization_info().quantize(b_const, RoundingPolicy::TO_NEAREST_UP);
+  }
+
+  // Set build options
+  std::set<std::string> build_opts;
+  build_opts.emplace(
+    ("-DACT=" + lower_string(string_from_activation_func_ex(act_info.activation()))));
+  build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
+  build_opts.emplace(
+    ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+
+  if (is_data_type_quantized(dt))
+  {
+    build_opts.emplace(("-DA_VAL=" + support::cpp11::to_string(a_const_int)));
+    build_opts.emplace(("-DB_VAL=" + support::cpp11::to_string(b_const_int)));
+
+    const int o1 = input->info()->quantization_info().offset;
+    // Quantized value of 0 corresponds to the offset o1
+    build_opts.emplace(("-DCONST_0=" + support::cpp11::to_string(o1)));
+
+    // Set scale and offset of the input and output if they have different quantization info
+    if (is_data_type_quantized_asymmetric(dt) && output != nullptr)
+    {
+      const float s1 = input->info()->quantization_info().scale;
+      const float s2 = output->info()->quantization_info().scale;
+      const int o2 = output->info()->quantization_info().offset;
+
+      if (o1 != o2 || s1 != s2)
+      {
+        build_opts.emplace(("-DS1_VAL=" + float_to_string_with_full_precision(s1)));
+        build_opts.emplace(("-DS2_VAL=" + float_to_string_with_full_precision(s2)));
+        build_opts.emplace(("-DO1_VAL=" + support::cpp11::to_string(o1)));
+        build_opts.emplace(("-DO2_VAL=" + support::cpp11::to_string(o2)));
+      }
+    }
+  }
+  else
+  {
+    build_opts.emplace(("-DA_VAL=" + float_to_string_with_full_precision(a_const)));
+    build_opts.emplace(("-DB_VAL=" + float_to_string_with_full_precision(b_const)));
+  }
+
+  // Add the flag only when it applies, so no empty entry ends up in the option set
+  if (_run_in_place)
+  {
+    build_opts.emplace("-DIN_PLACE");
+  }
+
+  // Create kernel
+  std::string kernel_name = std::string("activation_layer_ex");
+  _kernel =
+    static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+  // Make sure _kernel is initialized before calling the parent's configure
+  _input = input;
+  _output = output;
+
+  // Configure kernel window; output is non-null whenever _run_in_place is false
+  auto win_config =
+    validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info());
+  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+  ICLKernel::configure_internal(win_config.second);
+
+  // Set config_id for enabling LWS tuning
+  _config_id = "activation_layer_ex_";
+  _config_id += lower_string(string_from_data_type(dt));
+  _config_id += "_";
+  _config_id += support::cpp11::to_string(input->info()->dimension(0));
+  _config_id += "_";
+  _config_id += support::cpp11::to_string(input->info()->dimension(1));
+}
+
+/** Static check of a prospective configuration.
+ *
+ * Runs the argument checks, then the window configuration on clones of the tensor infos
+ * so that the caller's objects are not the ones handed to the window helper.
+ *
+ * @param[in] input    Source tensor info.
+ * @param[in] output   Destination tensor info; nullptr or @p input means in-place.
+ * @param[in] act_info Activation description.
+ *
+ * @return The first failing status, or a default-constructed Status.
+ */
+Status CLActivationLayerExKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                           const ActivationLayerInfoEx &act_info)
+{
+  const bool in_place = (output == nullptr) || (output == input);
+
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, act_info));
+
+  if (in_place)
+  {
+    ARM_COMPUTE_RETURN_ON_ERROR(
+      validate_and_configure_window(input->clone().get(), nullptr).first);
+  }
+  else
+  {
+    ARM_COMPUTE_RETURN_ON_ERROR(
+      validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+  }
+
+  return Status{};
+}
+
+/** Enqueue the kernel over every 3D slice of the (possibly collapsed) window.
+ *
+ * @param[in]     window Region to execute; must be a valid sub-window of the configured one.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLActivationLayerExKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  Window exec_window = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+  Window slice3d = exec_window.first_slice_window_3D();
+
+  do
+  {
+    unsigned int arg_idx = 0;
+    add_3D_tensor_argument(arg_idx, _input, slice3d);
+    // The output is a separate kernel argument only for out-of-place execution
+    if (!_run_in_place)
+    {
+      add_3D_tensor_argument(arg_idx, _output, slice3d);
+    }
+    enqueue(queue, *this, slice3d, lws_hint());
+  } while (exec_window.slide_window_slice_3D(slice3d));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp
new file mode 100644
index 000000000..c1a2ad0be
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLArgMinMaxKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+/** Derive the result shape: a copy of @p input_shape with the reduced axis set to 1.
+ *
+ * @param[in] input_shape    Shape of the tensor being reduced.
+ * @param[in] argminmax_axis Index of the dimension the arg-min/max runs along.
+ *
+ * @return The copied shape with dimension @p argminmax_axis replaced by 1.
+ */
+const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t argminmax_axis)
+{
+  TensorShape reduced(input_shape);
+  reduced.set(argminmax_axis, 1);
+  return reduced;
+}
+} // namespace
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+/** Check that input, output and axis form a valid arg-min/max configuration.
+ *
+ * @param[in] input          Source tensor info; must be single-channel S32, F32 or U8.
+ * @param[in] output         Destination tensor info; must share the input's data type, be
+ *                           non-empty and hold as many elements as the inferred output shape.
+ * @param[in] argminmax_axis Axis to reduce; must be less than the input's rank.
+ * @param[in] op             Requested operation. Not inspected by any check here.
+ *
+ * @return A default-constructed Status when no check fails.
+ */
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          const uint32_t argminmax_axis, ArgOperation op)
+{
+  ARM_COMPUTE_UNUSED(op);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32, DataType::F32,
+                                                       DataType::U8);
+  // Report through the returned Status rather than asserting, as a validator should
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+  // Reject an out-of-range axis before it is used to infer the output shape.
+  // The axis is unsigned, so only the upper bound needs checking.
+  const auto num_dimensions = input->tensor_shape().num_dimensions();
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+    argminmax_axis >= num_dimensions,
+    "argminmax_axis must be greater than or equal to 0 and less than (input's rank).");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
+                                  "Output tensor shape must not be empty");
+
+  // Only the element count is compared with the inferred shape, not the exact dimensions
+  const TensorShape output_shape = inferOutputShape(input->tensor_shape(), argminmax_axis);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(),
+                                  "output shape's size does not match argminmax_axis");
+
+  return Status{};
+}
+
+} // namespace
+
+CLArgMinMaxKernel::CLArgMinMaxKernel() : _input(nullptr), _output(nullptr), _argminmax_axis() {}
+
+/** Build the "arg_op" OpenCL kernel and its execution window.
+ *
+ * @param[in]  input          Source tensor.
+ * @param[out] output         Destination tensor; its valid region is set to the inferred
+ *                            output shape.
+ * @param[in]  argminmax_axis Axis the reduction runs along.
+ * @param[in]  op             ArgOperation::MAX or ArgOperation::MIN; anything else throws
+ *                            std::runtime_error.
+ */
+void CLArgMinMaxKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                  const uint32_t argminmax_axis, ArgOperation op)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  // validate_arguments() has four parameters; the operation must be forwarded as well
+  ARM_COMPUTE_ERROR_THROW_ON(
+    validate_arguments(input->info(), output->info(), argminmax_axis, op));
+
+  _input = input;
+  _output = output;
+  _argminmax_axis = argminmax_axis;
+
+  // Work on a clone carrying the inferred shape; the output's own shape is left untouched
+  std::unique_ptr<ITensorInfo> output_info = output->info()->clone();
+  output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), argminmax_axis));
+
+  // Construct kernel name for argmax and argmin based on axis
+  std::string kernel_name = "arg_op";
+  int op_code = 0;
+  if (op == ArgOperation::MAX)
+  {
+    op_code = 1;
+  }
+  else if (op == ArgOperation::MIN)
+  {
+    op_code = 2;
+  }
+  else
+    throw std::runtime_error("Operation not supported, yet");
+
+  // Set kernel build options
+  std::set<std::string> build_opts;
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(output_info->data_type()));
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2)));
+  build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code));
+
+  // Create kernel
+  _kernel =
+    static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+  // Configure kernel window
+  Window win = calculate_max_window(*output_info, Steps());
+
+  Coordinates coord;
+  coord.set_num_dimensions(output_info->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape()));
+
+  ICLKernel::configure_internal(win);
+}
+
+/** Static check of a prospective arg-min/max configuration.
+ *
+ * @param[in] input          Source tensor info.
+ * @param[in] output         Destination tensor info.
+ * @param[in] argminmax_axis Axis the reduction runs along.
+ * @param[in] op             Requested operation.
+ *
+ * @return The first failing status (including a null @p input or @p output), or a
+ *         default-constructed Status.
+ */
+Status CLArgMinMaxKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                   const uint32_t argminmax_axis, ArgOperation op)
+{
+  // Return the null-pointer failure through the Status instead of asserting
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, argminmax_axis, op));
+
+  return Status{};
+}
+
+/** Enqueue the arg-min/max kernel.
+ *
+ * While enqueuing, the output tensor's shape is temporarily replaced by the inferred
+ * shape and restored afterwards.
+ *
+ * @param[in]     window Region to execute; must be a valid sub-window of the configured one.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLArgMinMaxKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  const TensorShape &input_shape = _input->info()->tensor_shape();
+
+  // The axis and its extent follow the two 4D tensor argument groups
+  unsigned int scalar_idx = 2 * num_arguments_per_4D_tensor();
+  _kernel.setArg<cl_int>(scalar_idx++, _argminmax_axis);
+  _kernel.setArg<cl_int>(scalar_idx++, input_shape[_argminmax_axis]);
+
+  Window out_slice = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  // Input slice: a copy of the output slice with its first four dimensions zeroed
+  Window in_slice(out_slice);
+  in_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
+  in_slice.set(Window::DimY, Window::Dimension(0, 0, 0));
+  in_slice.set(Window::DimZ, Window::Dimension(0, 0, 0));
+  in_slice.set(3, Window::Dimension(0, 0, 0));
+
+  // Remember the output's current shape so it can be put back before returning
+  const TensorShape saved_out_shape = _output->info()->tensor_shape();
+  _output->info()->set_tensor_shape(inferOutputShape(input_shape, _argminmax_axis));
+
+  do
+  {
+    unsigned int tensor_idx = 0;
+    add_4D_tensor_argument(tensor_idx, _input, in_slice);
+    add_4D_tensor_argument(tensor_idx, _output, out_slice);
+    enqueue(queue, *this, out_slice);
+  } while (window.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice));
+
+  // Put the original shape back on the output tensor
+  _output->info()->set_tensor_shape(saved_out_shape);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp
new file mode 100644
index 000000000..1c505b4d5
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionExKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+/** Check data types and broadcast compatibility of a subtraction configuration.
+ *
+ * @param[in] input1 First operand info; single-channel U8, S16, F16 or F32.
+ * @param[in] input2 Second operand info; single-channel U8, S16, F16 or F32.
+ * @param[in] output Destination info. The output-specific checks are skipped while it
+ *                   is still empty.
+ * @param[in] policy Overflow policy. Not inspected by any check here.
+ *
+ * @return A default-constructed Status when no check fails.
+ */
+Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
+                          const ITensorInfo *output, ConvertPolicy policy)
+{
+  ARM_COMPUTE_UNUSED(policy);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16,
+                                                       DataType::F16, DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16,
+                                                       DataType::F16, DataType::F32);
+
+  const TensorShape expected_shape =
+    TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(expected_shape.total_size() == 0,
+                                  "Inputs are not broadcast compatible");
+
+  // Nothing further to check until the output has been configured
+  if (output->total_size() == 0)
+  {
+    return Status{};
+  }
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16,
+                                                       DataType::F16, DataType::F32);
+
+  const bool both_inputs_u8 =
+    (input1->data_type() == DataType::U8) && (input2->data_type() == DataType::U8);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && !both_inputs_u8,
+                                  "Output can only be U8 if both inputs are U8");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+    detail::have_different_dimensions(expected_shape, output->tensor_shape(), 0),
+    "Wrong shape for output");
+
+  return Status{};
+}
+
+/** Auto-initialise the output and compute the execution window for a broadcast subtraction.
+ *
+ * @param[in,out] input1 First operand info.
+ * @param[in,out] input2 Second operand info.
+ * @param[in,out] output Destination info; an empty shape and unknown format are filled in
+ *                       from the inputs.
+ *
+ * @return The status ("Insufficient Padding!" when update_window_and_padding reports a
+ *         change) paired with the output window.
+ */
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2,
+                                                        ITensorInfo *output)
+{
+  const std::pair<TensorShape, ValidRegion> broadcast =
+    ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
+  const TensorShape &bc_shape = broadcast.first;
+  const ValidRegion &bc_region = broadcast.second;
+
+  // Fill in whatever the caller left unset on the output
+  set_shape_if_empty(*output, bc_shape);
+
+  const DataType type1 = input1->data_type();
+  const DataType type2 = input2->data_type();
+  if (type1 == DataType::S16 || type2 == DataType::S16)
+  {
+    set_format_if_unknown(*output, Format::S16);
+  }
+  else if (type1 == DataType::F16 && type2 == DataType::F16)
+  {
+    set_format_if_unknown(*output, Format::F16);
+  }
+  else if (type1 == DataType::F32 || type2 == DataType::F32)
+  {
+    set_format_if_unknown(*output, Format::F32);
+  }
+
+  Window out_win = calculate_max_window(bc_region, Steps(num_elems_processed_per_iteration));
+  Window in1_win = out_win.broadcast_if_dimension_le_one(*input1);
+  Window in2_win = out_win.broadcast_if_dimension_le_one(*input2);
+
+  AccessWindowHorizontal in1_access(input1, 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal in2_access(input2, 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal out_access(output, 0, num_elems_processed_per_iteration);
+
+  // Evaluated left to right; a later update is skipped once an earlier one reports a change
+  const bool padding_changed = update_window_and_padding(in1_win, in1_access) ||
+                               update_window_and_padding(in2_win, in2_access) ||
+                               update_window_and_padding(out_win, out_access);
+
+  out_access.set_valid_region(out_win, bc_region);
+
+  if (padding_changed)
+  {
+    return std::make_pair(
+      ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!"), out_win);
+  }
+  return std::make_pair(Status{}, out_win);
+}
+} // namespace
+
+CLArithmeticSubtractionExKernel::CLArithmeticSubtractionExKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+/** Build the "arithmetic_sub_ex" OpenCL kernel and its execution window.
+ *
+ * @param[in]  input1 First operand.
+ * @param[in]  input2 Second operand.
+ * @param[out] output Destination tensor; auto-initialised by the window helper if empty.
+ * @param[in]  policy Overflow policy. WRAP is always selected when the output is floating
+ *                    point.
+ */
+void CLArithmeticSubtractionExKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
+                                                ICLTensor *output, ConvertPolicy policy)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+  ARM_COMPUTE_ERROR_THROW_ON(
+    validate_arguments(input1->info(), input2->info(), output->info(), policy));
+
+  // The window helper also fills in an empty output description
+  auto window_and_status =
+    validate_and_configure_window(input1->info(), input2->info(), output->info());
+  ARM_COMPUTE_ERROR_THROW_ON(window_and_status.first);
+
+  _input1 = input1;
+  _input2 = input2;
+  _output = output;
+
+  const bool float_output = is_data_type_float(output->info()->data_type());
+  const bool use_wrap = (policy == ConvertPolicy::WRAP) || float_output;
+
+  // Assemble the compile-time definitions for the OpenCL program
+  std::set<std::string> defines;
+  defines.emplace(use_wrap ? "-DWRAP" : "-DSATURATE");
+  defines.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
+  defines.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
+  defines.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+
+  _kernel = static_cast<cl::Kernel>(
+    CLKernelLibraryEx::get().create_kernel("arithmetic_sub_ex", defines));
+
+  ICLKernel::configure_internal(window_and_status.second);
+}
+
+/** Static check of a prospective subtraction configuration.
+ *
+ * The window configuration is run on clones of the tensor infos.
+ *
+ * @param[in] input1 First operand info.
+ * @param[in] input2 Second operand info.
+ * @param[in] output Destination info.
+ * @param[in] policy Overflow policy.
+ *
+ * @return The first failing status (including a null tensor info), or a
+ *         default-constructed Status.
+ */
+Status CLArithmeticSubtractionExKernel::validate(const ITensorInfo *input1,
+                                                 const ITensorInfo *input2,
+                                                 const ITensorInfo *output, ConvertPolicy policy)
+{
+  // Return the null-pointer failure through the Status instead of asserting
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, policy));
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(),
+                                                            input2->clone().get(),
+                                                            output->clone().get())
+                                .first);
+
+  return Status{};
+}
+
+/** Enqueue the subtraction kernel slice by slice, broadcasting the inputs where needed.
+ *
+ * @param[in]     window Region to execute; must be a valid sub-window of the configured one.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLArithmeticSubtractionExKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  const TensorShape &shape1 = _input1->info()->tensor_shape();
+  const TensorShape &shape2 = _input2->info()->tensor_shape();
+  const TensorShape &shape_out = _output->info()->tensor_shape();
+
+  // When both inputs hold more than one element, collapsing from Z upwards additionally
+  // requires both to reach past DimZ and to agree on every dimension from Z on.
+  bool collapsible = true;
+  const bool both_multi_element = std::min(shape1.total_size(), shape2.total_size()) > 1;
+  if (both_multi_element)
+  {
+    collapsible = (std::min(shape1.num_dimensions(), shape2.num_dimensions()) > Window::DimZ);
+    for (size_t dim = Window::DimZ; collapsible && (dim < shape_out.num_dimensions()); ++dim)
+    {
+      collapsible = (shape1[dim] == shape2[dim]);
+    }
+  }
+
+  bool did_collapse = false;
+  Window exec_window =
+    collapsible ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &did_collapse)
+                : window;
+
+  // Mirror the window collapse on the input shapes used for broadcasting
+  const TensorShape shape1_eff = did_collapse ? shape1.collapsed_from(Window::DimZ) : shape1;
+  const TensorShape shape2_eff = did_collapse ? shape2.collapsed_from(Window::DimZ) : shape2;
+
+  Window out_slice = exec_window.first_slice_window_3D();
+  Window in1_slice = out_slice.broadcast_if_dimension_le_one(shape1_eff);
+  Window in2_slice = out_slice.broadcast_if_dimension_le_one(shape2_eff);
+
+  do
+  {
+    unsigned int arg_idx = 0;
+
+    add_3D_tensor_argument(arg_idx, _input1, in1_slice);
+    add_3D_tensor_argument(arg_idx, _input2, in2_slice);
+    add_3D_tensor_argument(arg_idx, _output, out_slice);
+
+    enqueue(queue, *this, out_slice);
+
+    // Advance the input slices; the loop itself is driven by the output slice
+    exec_window.slide_window_slice_3D(in1_slice);
+    exec_window.slide_window_slice_3D(in2_slice);
+  } while (exec_window.slide_window_slice_3D(out_slice));
+}
+
+/** Border required by the kernel.
+ *
+ * @return A BorderSize whose second component is the difference between the output width
+ *         and the narrower input width, capped at one less than the vector size; the
+ *         other components are 0.
+ */
+BorderSize CLArithmeticSubtractionExKernel::border_size() const
+{
+  const unsigned int width_gap =
+    _output->info()->dimension(0) -
+    std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+  const unsigned int max_border = num_elems_processed_per_iteration - 1U;
+  const unsigned int capped = std::min<unsigned int>(max_border, width_gap);
+  return BorderSize(0, capped, 0, 0);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp
new file mode 100644
index 000000000..b0016d23c
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+/** Check that input, output and block sizes form a valid batch-to-space configuration.
+ *
+ * Each check returns an error when its constraint is violated.
+ *
+ * @param[in] input      Source tensor info.
+ * @param[in] output     Destination tensor info.
+ * @param[in] block_size Pointer to two block sizes. Per the checks below, block_size[0]
+ *                       scales dimension 1 and block_size[1] scales dimension 0.
+ *
+ * @return A default-constructed Status when no check fails.
+ */
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          const int32_t *block_size)
+{
+  // Report through the returned Status rather than asserting, as a validator should
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+                                                       DataType::S16, DataType::S32, DataType::F16,
+                                                       DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+                                                       DataType::S16, DataType::S32, DataType::F16,
+                                                       DataType::F32);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size == nullptr, "Block size must not be null");
+
+  // Must precede the modulo checks below so they never divide by zero
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size[0] < 1 || block_size[1] < 1,
+                                  "Block size should be greater than or equal to 1.");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != output->dimension(2),
+                                  "Input Depth should be equal to Output Depth");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+    output->dimension(3) * block_size[0] * block_size[1] != input->dimension(3),
+    "Input batch should be equal to (output batch * block size[0] *block size[1])");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) % block_size[1]) ||
+                                    (output->dimension(1) % block_size[0]),
+                                  "Output height and width should be divisible by block size[0] "
+                                  "and block_size[1] respectively");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != input->dimension(0) * block_size[1]) ||
+                                    (output->dimension(1) != input->dimension(1) * block_size[0]),
+                                  "Output height and width should be equal to "
+                                  "input_height*blocksize[0] and input_width*blocksize[1] "
+                                  "respectively");
+
+  return Status{};
+}
+
+} // namespace
+
+CLBatchToSpaceNDKernel::CLBatchToSpaceNDKernel() : _input(nullptr), _output(nullptr) {}
+
+/** Build the "batch_to_space_nd" OpenCL kernel and its execution window.
+ *
+ * @param[in]  input      Source tensor.
+ * @param[out] output     Destination tensor; its valid region is set to its full shape.
+ * @param[in]  block_size Pointer to the two block sizes passed to the kernel as
+ *                        BLOCK_SIZE0 and BLOCK_SIZE1.
+ */
+void CLBatchToSpaceNDKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                       const int32_t *block_size)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size));
+
+  _input = input;
+  _output = output;
+
+  // Assemble the compile-time definitions for the OpenCL program
+  std::set<std::string> defines;
+  defines.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+  defines.emplace("-DBLOCK_SIZE0=" + support::cpp11::to_string(block_size[0]));
+  defines.emplace("-DBLOCK_SIZE1=" + support::cpp11::to_string(block_size[1]));
+  defines.emplace("-DBATCH_OUT=" + support::cpp11::to_string(output->info()->dimension(3)));
+  defines.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+
+  _kernel = static_cast<cl::Kernel>(
+    CLKernelLibraryEx::get().create_kernel("batch_to_space_nd", defines));
+
+  // The window spans the whole output tensor
+  Window full_window = calculate_max_window(*output->info(), Steps());
+
+  Coordinates anchor;
+  anchor.set_num_dimensions(output->info()->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(anchor, output->info()->tensor_shape()));
+
+  ICLKernel::configure_internal(full_window);
+}
+
+/** Enqueue the batch-to-space kernel.
+ *
+ * @param[in]     window Region to execute; must match the configured window.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLBatchToSpaceNDKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+  // Slice used for the output tensor and for the enqueue itself
+  Window exec_slice = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  // Slice used for the input tensor: a copy with its first four dimensions zeroed
+  Window zeroed_slice(exec_slice);
+  zeroed_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
+  zeroed_slice.set(Window::DimY, Window::Dimension(0, 0, 0));
+  zeroed_slice.set(Window::DimZ, Window::Dimension(0, 0, 0));
+  zeroed_slice.set(3, Window::Dimension(0, 0, 0));
+
+  do
+  {
+    unsigned int arg_idx = 0;
+    add_4D_tensor_argument(arg_idx, _input, zeroed_slice);
+    add_4D_tensor_argument(arg_idx, _output, exec_slice);
+    enqueue(queue, *this, exec_slice);
+  } while (window.slide_window_slice_4D(zeroed_slice) &&
+           window.slide_window_slice_4D(exec_slice));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
new file mode 100644
index 000000000..3d2f2c702
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+// Step used along X for the execution window; also forwarded to the kernel as VEC_SIZE.
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+// Checks the operand data types and their broadcast compatibility and, when the output has
+// already been configured, the output data type and shape as well.
+Status validate_parameters(const ITensorInfo *input1, const ITensorInfo *input2,
+                           const ITensorInfo *output)
+{
+  const TensorShape broadcasted =
+      TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(broadcasted.total_size() == 0,
+                                  "Inputs are not broadcast compatible");
+
+  // Nothing more to check while the output is still unconfigured.
+  if (output->total_size() == 0)
+  {
+    return Status{};
+  }
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8,
+                                                       DataType::QASYMM8);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      detail::have_different_dimensions(broadcasted, output->tensor_shape(), 0),
+      "Wrong shape for output");
+  return Status{};
+}
+} // namespace
+
+CLBinaryLogicalOpKernel::CLBinaryLogicalOpKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+// Configures the kernel for an element-wise logical operation on two (broadcastable) inputs.
+//
+// input1, input2: operands; their data types must match each other and the output.
+// output:         destination tensor.
+// op:             operation to perform; only AND and OR are handled, anything else throws
+//                 std::runtime_error.
+void CLBinaryLogicalOpKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
+ ICLTensor *output, BinaryLogicalOperation op)
+{
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_parameters(input1->info(), input2->info(), output->info()));
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ // Create kernel
+ std::string kernel_name = "binary_logical_op";
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())));
+
+ // Map the requested operation onto the integer passed to the kernel as OP_CODE.
+ int op_code = 0;
+ switch (op)
+ {
+ case BinaryLogicalOperation::AND:
+ op_code = 1;
+ break;
+ case BinaryLogicalOperation::OR:
+ op_code = 2;
+ break;
+ default:
+ throw std::runtime_error("Operation not supported, yet");
+ }
+
+ build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code)));
+ build_opts.emplace(
+ ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+ // Broadcast shape and valid region of the two inputs.
+ const std::pair<TensorShape, ValidRegion> broadcast_pair =
+ ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
+
+ // NOTE(review): out_shape is not referenced again in this function.
+ const TensorShape &out_shape = broadcast_pair.first;
+ const ValidRegion &valid_region = broadcast_pair.second;
+
+ // Execution window over the broadcast valid region, plus per-input broadcast variants.
+ Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+ Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info());
+ Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info());
+
+ AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ // NOTE(review): `||` short-circuits, so once one call returns true the remaining
+ // update_window_and_padding calls are not evaluated. window_changed itself is never read.
+ bool window_changed = update_window_and_padding(win_input1, input1_access) ||
+ update_window_and_padding(win_input2, input2_access) ||
+ update_window_and_padding(win, output_access);
+
+ output_access.set_valid_region(win, valid_region);
+
+ ICLKernel::configure_internal(win);
+}
+
+// Enqueues the kernel over 3D slices of `window`, collapsing the higher dimensions when the
+// input shapes allow it and broadcasting each input's slice where its dimension is <= 1.
+void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const TensorShape &in_shape1 = _input1->info()->tensor_shape();
+ const TensorShape &in_shape2 = _input2->info()->tensor_shape();
+ const TensorShape &out_shape = _output->info()->tensor_shape();
+
+ // Collapsing is always attempted when at least one input holds at most one element.
+ // Otherwise both inputs must have more than DimZ dimensions and identical sizes in every
+ // dimension from DimZ upwards.
+ bool can_collapse = true;
+ if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+ {
+ can_collapse =
+ (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+ for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+ {
+ can_collapse = (in_shape1[d] == in_shape2[d]);
+ }
+ }
+
+ // has_collapsed is filled in by collapse_if_possible and stays false when it is not called.
+ bool has_collapsed = false;
+ Window collapsed =
+ can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
+ : window;
+
+ // Collapse the input shapes the same way, so the broadcast decision below sees shapes that
+ // match the collapsed window.
+ const TensorShape &in_shape1_collapsed =
+ has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+ const TensorShape &in_shape2_collapsed =
+ has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+ Window slice = collapsed.first_slice_window_3D();
+ Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+ Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input1, slice_input1);
+ add_3D_tensor_argument(idx, _input2, slice_input2);
+ add_3D_tensor_argument(idx, _output, slice);
+
+ enqueue(queue, *this, slice);
+
+ // Advance the input slices; only the output slice decides whether the loop continues.
+ collapsed.slide_window_slice_3D(slice_input1);
+ collapsed.slide_window_slice_3D(slice_input2);
+ } while (collapsed.slide_window_slice_3D(slice));
+}
+
+// Returns the border used by this kernel: the gap between the output width and the narrower
+// input width, capped at (num_elems_processed_per_iteration - 1). It is passed as the second
+// BorderSize argument; the other three are zero.
+BorderSize CLBinaryLogicalOpKernel::border_size() const
+{
+  const auto output_width = _output->info()->dimension(0);
+  const auto narrowest_input_width =
+      std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+
+  const unsigned int width_gap = static_cast<unsigned int>(output_width - narrowest_input_width);
+  const unsigned int max_border = num_elems_processed_per_iteration - 1U;
+
+  return BorderSize(0, std::min<unsigned int>(max_border, width_gap), 0, 0);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
index b019e8c33..bf7ebae3f 100644
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
@@ -17,15 +17,8 @@
#include "arm_compute/core/CL/kernels/CLCastKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
using namespace arm_compute;
@@ -60,8 +53,8 @@ void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output)
{
const float scale_in = input->info()->quantization_info().scale;
const int offset_in = input->info()->quantization_info().offset;
- build_opts.emplace("-DSCALE_IN=" + float_to_string_with_full_precision(scale_in));
- build_opts.emplace("-DOFFSET_IN=" + support::cpp11::to_string(offset_in));
+ build_opts.emplace("-DSCALE=" + float_to_string_with_full_precision(scale_in));
+ build_opts.emplace("-DOFFSET=" + support::cpp11::to_string(offset_in));
_kernel = static_cast<cl::Kernel>(
CLKernelLibraryEx::get().create_kernel("cast_qasymm_in", build_opts));
@@ -70,8 +63,8 @@ void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output)
{
const float scale_in = output->info()->quantization_info().scale;
const int offset_in = output->info()->quantization_info().offset;
- build_opts.emplace("-DSCALE_IN=" + float_to_string_with_full_precision(scale_in));
- build_opts.emplace("-DOFFSET_IN=" + support::cpp11::to_string(offset_in));
+ build_opts.emplace("-DSCALE=" + float_to_string_with_full_precision(scale_in));
+ build_opts.emplace("-DOFFSET=" + support::cpp11::to_string(offset_in));
_kernel = static_cast<cl::Kernel>(
CLKernelLibraryEx::get().create_kernel("cast_qasymm_out", build_opts));
@@ -88,7 +81,7 @@ void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output)
update_window_and_padding(win, input_access, output_access);
output_access.set_valid_region(win, input->info()->valid_region());
- ICLKernel::configure(win);
+ ICLKernel::configure_internal(win);
}
void CLCastKernel::run(const Window &window, cl::CommandQueue &queue)
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp
new file mode 100644
index 000000000..5af5b16ea
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLComparisonOpKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+// Step used along X for the execution window; also forwarded to the kernel as VEC_SIZE.
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+// Checks the operand data types and their broadcast compatibility and, when the output has
+// already been configured, that it is QASYMM8 with the broadcast shape.
+Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
+                          const ITensorInfo *output)
+{
+  const TensorShape broadcasted =
+      TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::U16,
+                                                       DataType::S16, DataType::F16, DataType::S32,
+                                                       DataType::F32, DataType::QASYMM8);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::U16,
+                                                       DataType::S16, DataType::F16, DataType::S32,
+                                                       DataType::F32, DataType::QASYMM8);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(broadcasted.total_size() == 0,
+                                  "Inputs are not broadcast compatible");
+
+  // Nothing more to check while the output is still unconfigured.
+  if (output->total_size() == 0)
+  {
+    return Status{};
+  }
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      detail::have_different_dimensions(broadcasted, output->tensor_shape(), 0),
+      "Wrong shape for output");
+  return Status{};
+}
+} // namespace
+
+CLComparisonOpKernel::CLComparisonOpKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+// Configures the kernel for an element-wise comparison of two (broadcastable) inputs.
+//
+// input1, input2: operands; their data types must match.
+// output:         destination tensor; its shape is set to the broadcast shape if still empty.
+// op:             comparison to perform; only EQUAL and NOT_EQUAL are handled, anything else
+//                 throws std::runtime_error.
+//
+// When the inputs are asymmetric-quantized with differing offset or scale, the "_qasymm8"
+// kernel variant is selected and both sets of quantization parameters are passed to it.
+void CLComparisonOpKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
+                                     ICLTensor *output, const ComparisonOperation &op)
+{
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info()));
+
+  _input1 = input1;
+  _input2 = input2;
+  _output = output;
+
+  // Create kernel
+  std::string kernel_name = "comparison_op";
+  int op_code = 0;
+
+  // Map the requested comparison onto the integer passed to the kernel as OP_CODE.
+  switch (op)
+  {
+    case ComparisonOperation::EQUAL:
+      op_code = 1;
+      break;
+    case ComparisonOperation::NOT_EQUAL:
+      op_code = 2;
+      break;
+    default:
+      throw std::runtime_error(" Operation not supported, yet");
+  }
+
+  std::set<std::string> build_opts;
+  build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code)));
+  build_opts.emplace(("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input1->info()->data_type())));
+  build_opts.emplace(
+      ("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())));
+  build_opts.emplace(
+      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+
+  if (is_data_type_quantized_asymmetric(input1->info()->data_type()) &&
+      ((input1->info()->quantization_info().offset != input2->info()->quantization_info().offset) ||
+       (input1->info()->quantization_info().scale != input2->info()->quantization_info().scale)))
+  {
+    build_opts.emplace("-DOFFSET_IN1=" +
+                       support::cpp11::to_string(input1->info()->quantization_info().offset));
+    build_opts.emplace("-DOFFSET_IN2=" +
+                       support::cpp11::to_string(input2->info()->quantization_info().offset));
+    // The scales are floats: a fixed six-decimal to_string would truncate small scales, so
+    // they are emitted with full precision instead.
+    build_opts.emplace("-DSCALE_IN1=" + float_to_string_with_full_precision(
+                                            input1->info()->quantization_info().scale));
+    build_opts.emplace("-DSCALE_IN2=" + float_to_string_with_full_precision(
+                                            input2->info()->quantization_info().scale));
+    kernel_name += "_qasymm8";
+  }
+
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+  const std::pair<TensorShape, ValidRegion> broadcast_pair =
+      ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
+
+  const TensorShape &out_shape = broadcast_pair.first;
+  const ValidRegion &valid_region = broadcast_pair.second;
+
+  // Auto initialize output if not initialized
+  // NOTE(review): this runs after validate_arguments and after DATA_TYPE_OUT was derived from
+  // the output's data type; confirm that ordering is intended.
+  {
+    set_shape_if_empty(*output->info(), out_shape);
+
+    if (input1->info()->data_type() == DataType::S16 ||
+        input2->info()->data_type() == DataType::S16)
+    {
+      set_format_if_unknown(*output->info(), Format::S16);
+    }
+    else if (input1->info()->data_type() == DataType::F16 &&
+             input2->info()->data_type() == DataType::F16)
+    {
+      set_format_if_unknown(*output->info(), Format::F16);
+    }
+    else if (input1->info()->data_type() == DataType::F32 ||
+             input2->info()->data_type() == DataType::F32)
+    {
+      set_format_if_unknown(*output->info(), Format::F32);
+    }
+  }
+
+  // Execution window over the broadcast valid region, plus per-input broadcast variants.
+  Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+  Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info());
+  Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info());
+
+  AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+  // NOTE(review): `||` short-circuits, so once one call returns true the remaining
+  // update_window_and_padding calls are not evaluated. window_changed itself is never read.
+  bool window_changed = update_window_and_padding(win_input1, input1_access) ||
+                        update_window_and_padding(win_input2, input2_access) ||
+                        update_window_and_padding(win, output_access);
+
+  output_access.set_valid_region(win, valid_region);
+
+  ICLKernel::configure_internal(win);
+}
+
+// Enqueues the kernel over 3D slices of `window`, collapsing the higher dimensions when the
+// input shapes allow it and broadcasting each input's slice where its dimension is <= 1.
+void CLComparisonOpKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const TensorShape &in_shape1 = _input1->info()->tensor_shape();
+ const TensorShape &in_shape2 = _input2->info()->tensor_shape();
+ const TensorShape &out_shape = _output->info()->tensor_shape();
+
+ // Collapsing is always attempted when at least one input holds at most one element.
+ // Otherwise both inputs must have more than DimZ dimensions and identical sizes in every
+ // dimension from DimZ upwards.
+ bool can_collapse = true;
+ if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+ {
+ can_collapse =
+ (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+ for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+ {
+ can_collapse = (in_shape1[d] == in_shape2[d]);
+ }
+ }
+
+ // has_collapsed is filled in by collapse_if_possible and stays false when it is not called.
+ bool has_collapsed = false;
+ Window collapsed =
+ can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
+ : window;
+
+ // Collapse the input shapes the same way, so the broadcast decision below sees shapes that
+ // match the collapsed window.
+ const TensorShape &in_shape1_collapsed =
+ has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+ const TensorShape &in_shape2_collapsed =
+ has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+ Window slice = collapsed.first_slice_window_3D();
+ Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+ Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input1, slice_input1);
+ add_3D_tensor_argument(idx, _input2, slice_input2);
+ add_3D_tensor_argument(idx, _output, slice);
+
+ enqueue(queue, *this, slice);
+
+ // Advance the input slices; only the output slice decides whether the loop continues.
+ collapsed.slide_window_slice_3D(slice_input1);
+ collapsed.slide_window_slice_3D(slice_input2);
+ } while (collapsed.slide_window_slice_3D(slice));
+}
+
+// Returns the border used by this kernel: the gap between the output width and the narrower
+// input width, capped at (num_elems_processed_per_iteration - 1). It is passed as the second
+// BorderSize argument; the other three are zero.
+BorderSize CLComparisonOpKernel::border_size() const
+{
+  const auto output_width = _output->info()->dimension(0);
+  const auto narrowest_input_width =
+      std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+
+  const unsigned int width_gap = static_cast<unsigned int>(output_width - narrowest_input_width);
+  const unsigned int max_border = num_elems_processed_per_iteration - 1U;
+
+  return BorderSize(0, std::min<unsigned int>(max_border, width_gap), 0, 0);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
new file mode 100644
index 000000000..c386e3312
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+// Validates the tensors and block size of a depth-to-space operation.
+//
+// Returns an error Status when block_size is smaller than 1, when the output width/height
+// is not the input width/height multiplied by block_size, when the input depth is not a
+// multiple of block_size^2, or when the output depth is not input depth / block_size^2.
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          const int32_t block_size)
+{
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+                                                DataType::S16, DataType::S32, DataType::F16,
+                                                DataType::F32);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+                                                DataType::S16, DataType::S32, DataType::F16,
+                                                DataType::F32);
+
+  // ARM_COMPUTE_RETURN_ERROR_ON_MSG reports its message when the condition is TRUE, so each
+  // condition below describes the failure case. block_size is checked first so that the later
+  // modulo and division never see a zero divisor.
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1,
+                                  "Block size should be greater than or equal to 1.");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0) * block_size,
+                                  "Output width should be equal to (Input width * block size)");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(1) != input->dimension(1) * block_size,
+                                  "Output height should be equal to (Input height * block size)");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) % (block_size * block_size) != 0,
+                                  "Input depth should be divisible by (block size * block size)");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      output->dimension(2) != input->dimension(2) / (block_size * block_size),
+      "Output depth should be equal to (Input depth / (block size * block size))");
+
+  return Status{};
+}
+} // namespace
+
+CLDepthToSpaceKernel::CLDepthToSpaceKernel() : _input(nullptr), _output(nullptr)
+{
+ // DO NOTHING
+}
+
+// Stores the tensors, builds the "depth_to_space" kernel and sets up an execution window
+// spanning the whole output tensor.
+//
+// NOTE(review): validate_arguments() defined above is not invoked from here.
+void CLDepthToSpaceKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                     const int32_t block_size)
+{
+  _input = input;
+  _output = output;
+
+  // Compile-time options handed to the OpenCL program.
+  const std::string data_type_opt =
+      "-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type());
+  const std::string block_size_opt = "-DBLOCK_SIZE=" + support::cpp11::to_string(block_size);
+  const std::string depth_out_opt =
+      "-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2));
+
+  std::set<std::string> build_opts;
+  build_opts.emplace(data_type_opt);
+  build_opts.emplace(block_size_opt);
+  build_opts.emplace(depth_out_opt);
+
+  // Build the kernel.
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("depth_to_space", build_opts));
+
+  // One work-item per output element (default Steps).
+  Window kernel_window = calculate_max_window(*output->info(), Steps());
+
+  // Mark the complete output shape as valid, anchored at default coordinates.
+  Coordinates anchor;
+  anchor.set_num_dimensions(output->info()->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(anchor, output->info()->tensor_shape()));
+
+  ICLKernel::configure_internal(kernel_window);
+}
+
+// Enqueues the configured kernel once per 4D slice of the execution window.
+void CLDepthToSpaceKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+  // Slice that drives the enqueue; it is also the window bound to the output tensor.
+  Window output_slice = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  // Slice bound to the input tensor: a copy whose first four dimensions are zeroed.
+  const Window::Dimension empty_dim(0, 0, 0);
+  Window input_slice(output_slice);
+  input_slice.set(Window::DimX, empty_dim);
+  input_slice.set(Window::DimY, empty_dim);
+  input_slice.set(Window::DimZ, empty_dim);
+  input_slice.set(3, empty_dim);
+
+  bool has_next = true;
+  while (has_next)
+  {
+    unsigned int arg_idx = 0;
+    add_4D_tensor_argument(arg_idx, _input, input_slice);
+    add_4D_tensor_argument(arg_idx, _output, output_slice);
+    enqueue(queue, *this, output_slice);
+
+    // The input slice is advanced first; the output slice only moves if that succeeded.
+    has_next =
+        window.slide_window_slice_4D(input_slice) && window.slide_window_slice_4D(output_slice);
+  }
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
new file mode 100644
index 000000000..0862b78bf
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+// Step used along X for the execution window; also forwarded to the kernel as VEC_SIZE.
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+// Computes the execution window over `output` and updates window/padding for horizontal
+// access to both tensors. Returns the window together with a RUNTIME_ERROR status when
+// update_window_and_padding reports a change, or an empty Status otherwise.
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+  Window kernel_window = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+  AccessWindowHorizontal in_access(input, 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal out_access(output, 0, num_elems_processed_per_iteration);
+
+  const bool changed = update_window_and_padding(kernel_window, in_access, out_access);
+  in_access.set_valid_region(kernel_window, output->valid_region());
+
+  if (changed)
+  {
+    return std::make_pair(
+        ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!"), kernel_window);
+  }
+  return std::make_pair(Status{}, kernel_window);
+}
+} // namespace
+
+CLEmbeddingLookupKernel::CLEmbeddingLookupKernel()
+ : _input(nullptr), _output(nullptr), _lookups(nullptr)
+{
+}
+
+// Checks the tensors of an embedding lookup.
+//
+// input:   values tensor; single-channel, one of the listed data types, rank 2 to 4.
+// output:  destination tensor; must have the same data type as input.
+// lookups: single-channel S32 tensor of at most one dimension.
+//
+// Violations are reported through the ARM_COMPUTE_ERROR_ON* macros; an empty Status is
+// returned when execution reaches the end.
+Status CLEmbeddingLookupKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                         const ITensorInfo *lookups)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+  // The rank is out of range when it is below 2 OR above 4. Joining the two tests with &&
+  // could never be true, so the check never fired.
+  ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 || input->num_dimensions() > 4);
+  ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1);
+
+  return Status{};
+}
+
+// Validates the tensors, stores them, builds the "embedding_lookup" kernel and configures
+// the execution window. Throws if validation or window configuration reports an error.
+void CLEmbeddingLookupKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                        const ICLTensor *lookups)
+{
+  ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info()));
+
+  _input = input;
+  _output = output;
+  _lookups = lookups;
+
+  // Name of the OpenCL kernel to instantiate.
+  const std::string kernel_name("embedding_lookup");
+
+  // Compile-time options handed to the OpenCL program.
+  std::set<std::string> build_opts;
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+  build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions()));
+
+  // Build the kernel.
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+  // Work out the execution window and fail if the tensors cannot provide it.
+  auto window_setup = validate_and_configure_window(input->info(), output->info());
+  ARM_COMPUTE_ERROR_THROW_ON(window_setup.first);
+  ICLKernel::configure_internal(window_setup.second);
+}
+
+// Enqueues the kernel once per 4D slice of `window`. The same slice is bound to both the
+// input and the output tensor; the lookups tensor is bound through a separate 1D window.
+void CLEmbeddingLookupKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+ // Window for the lookups tensor with its X dimension set to (0, 0, 0).
+ Window win_lookup;
+ win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _output, slice_in);
+ add_1D_tensor_argument(idx, _lookups, win_lookup);
+
+ enqueue(queue, *this, slice_in);
+ // `&&` short-circuits: win_lookup is only slid when sliding slice_in succeeded, and the
+ // loop ends as soon as either slide reports no further slice.
+ } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_1D(win_lookup));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp
new file mode 100644
index 000000000..b1ee21bdc
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLExpKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+CLExpKernel::CLExpKernel() : _input(nullptr), _output(nullptr) {}
+
+// Configures the "exp_layer" kernel for an F32 input.
+//
+// The output is auto-initialised from the input (shape, data type, quantization info) when
+// it is still empty; afterwards input and output must agree in shape and data type.
+void CLExpKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+  // Auto initialize output
+  auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(),
+                     input->info()->quantization_info());
+
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+  _input = input;
+  _output = output;
+
+  // Elements handled per work-item along X; also forwarded to the kernel as VEC_SIZE.
+  constexpr unsigned int vec_size = 4;
+
+  // Compile-time options handed to the OpenCL program.
+  const std::string data_type_opt =
+      "-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type());
+  const std::string vec_size_opt = "-DVEC_SIZE=" + support::cpp11::to_string(vec_size);
+
+  std::set<std::string> build_opts;
+  build_opts.emplace(data_type_opt);
+  build_opts.emplace(vec_size_opt);
+
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("exp_layer", build_opts));
+
+  // Execution window over the input, with horizontal access to both tensors.
+  Window kernel_window = calculate_max_window(*input->info(), Steps(vec_size));
+  AccessWindowHorizontal in_access(input->info(), 0, vec_size);
+  AccessWindowHorizontal out_access(output->info(), 0, vec_size);
+  update_window_and_padding(kernel_window, in_access, out_access);
+  out_access.set_valid_region(kernel_window, input->info()->valid_region());
+
+  ICLKernel::configure_internal(kernel_window);
+}
+
+// Enqueues the kernel over every 3D slice of the (possibly collapsed) window, binding the
+// same slice to the input and the output tensor.
+void CLExpKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  // Collapse dimensions from Z upwards where possible, then start at the first 3D slice.
+  Window collapsed_window = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+  Window current_slice = collapsed_window.first_slice_window_3D();
+
+  bool has_next = true;
+  while (has_next)
+  {
+    unsigned int arg_idx = 0;
+    add_3D_tensor_argument(arg_idx, _input, current_slice);
+    add_3D_tensor_argument(arg_idx, _output, current_slice);
+    enqueue(queue, *this, current_slice);
+
+    has_next = collapsed_window.slide_window_slice_3D(current_slice);
+  }
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp
index 23efafa6a..ae2801e2b 100644
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp
@@ -17,26 +17,14 @@
#include "arm_compute/core/CL/kernels/CLGatherKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <cmath>
-#include <cstdlib>
-#include <set>
-#include <string>
using namespace arm_compute;
namespace
{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
+constexpr unsigned int num_elems_processed_per_iteration = 1;
Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
const ITensorInfo *output)
@@ -46,6 +34,7 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S32,
DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
return Status{};
}
@@ -57,8 +46,7 @@ CLGatherKernel::CLGatherKernel() : _input1(nullptr), _input2(nullptr), _output(n
void CLGatherKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info()));
_input1 = input1;
_input2 = input2;
@@ -89,11 +77,10 @@ void CLGatherKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
// Configure kernel window
- const unsigned int num_elems_processed_per_iteration = 1;
Window win = calculate_max_window(*input2->info(), Steps(num_elems_processed_per_iteration));
output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
- ICLKernel::configure(win);
+ ICLKernel::configure_internal(win);
}
Status CLGatherKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2,
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
new file mode 100644
index 000000000..cd7b21c6d
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+// Elements handled per iteration along X; configure() also passes this to the kernel as VEC_SIZE.
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+/** Compute the execution window over @p output and update the padding of both tensors.
+ *
+ * @param[in,out] input  Info of the table tensor that is read.
+ * @param[in,out] output Info of the tensor that is written; the window is derived from it.
+ *
+ * @return The status (an "Insufficient Padding!" runtime error when the window had to change)
+ *         paired with the resulting window.
+ */
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+  Window window = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+
+  AccessWindowHorizontal in_access(input, 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal out_access(output, 0, num_elems_processed_per_iteration);
+
+  const bool padding_insufficient = update_window_and_padding(window, in_access, out_access);
+  in_access.set_valid_region(window, output->valid_region());
+
+  Status status{};
+  if (padding_insufficient)
+  {
+    status = ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!");
+  }
+  return std::make_pair(status, window);
+}
+} // namespace
+
+CLHashtableLookupKernel::CLHashtableLookupKernel() // Default constructor: tensors are attached later by configure().
+ : _input(nullptr), _output(nullptr), _lookups(nullptr) // NOTE(review): _keys and _hits are assigned in configure() but not initialised here - confirm the header gives them defaults.
+{
+}
+
+/** Check the tensor metadata of a hashtable lookup.
+ *
+ * @param[in] lookups Values to search for; 1-D, S32.
+ * @param[in] keys    Keys of the table; 1-D, S32.
+ * @param[in] input   Table values; 2-D to 4-D, any of the listed single-channel types.
+ * @param[in] output  Destination; must already have a shape and the data type of @p input.
+ * @param[in] hits    Per-lookup hit flags; 1-D, U8 or QASYMM8.
+ *
+ * @return An error status when the output shape has not been set, an empty status otherwise.
+ *         All other checks are assertions and are only active in assert-enabled builds.
+ */
+Status CLHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITensorInfo *keys,
+                                         const ITensorInfo *input, const ITensorInfo *output,
+                                         const ITensorInfo *hits)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
+                                  "Output's shape was not set");
+
+  // The assertion must fire on the *invalid* case: there has to be one hit flag per lookup and
+  // the outermost output dimension has to match the number of lookups.
+  // NOTE(review): num_dimensions() may not count trailing dimensions of size 1 - confirm these
+  // rank-based checks behave as intended for single-lookup / single-key tables.
+  ARM_COMPUTE_ERROR_ON(lookups->dimension(0) != hits->dimension(0) ||
+                       output->dimension(output->num_dimensions() - 1) != lookups->dimension(0));
+  // A rank outside [2, 4] is rejected ("<2 && >4" could never be true).
+  ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 || input->num_dimensions() > 4);
+  ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1);
+  ARM_COMPUTE_ERROR_ON(keys->num_dimensions() > 1);
+  ARM_COMPUTE_ERROR_ON(hits->num_dimensions() > 1);
+
+  return Status{};
+}
+
+/** Attach the tensors, build the "hashtable_lookup" OpenCL kernel and set the execution window.
+ *
+ * @param[in]  lookups Values to search for in @p keys.
+ * @param[in]  keys    Keys of the table.
+ * @param[in]  input   Table values.
+ * @param[out] output  Destination tensor.
+ * @param[out] hits    Per-lookup hit flags, filled by run().
+ */
+void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTensor *keys,
+                                        const ICLTensor *input, ICLTensor *output, ICLTensor *hits)
+{
+  // Guard the dereferences below, then validate with all five infos in the order that
+  // validate() declares them.
+  ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits);
+  ARM_COMPUTE_ERROR_THROW_ON(
+      validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info()));
+
+  _lookups = lookups;
+  _keys = keys;
+  _input = input;
+  _output = output;
+  _hits = hits;
+
+  // Make _lookup_indices tensor: one S32 entry per lookup, written by run() and read by the
+  // OpenCL kernel.
+  _lookup_indices = arm_compute::support::cpp14::make_unique<CLTensor>();
+  _lookup_indices->allocator()->init(
+      TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32));
+  _lookup_indices->allocator()->allocate();
+
+  // Set kernel build options
+  std::stringstream kernel_name;
+  std::set<std::string> build_opts;
+  kernel_name << "hashtable_lookup";
+
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+  build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(input->info()->num_dimensions()));
+
+  // Create kernel
+  _kernel = static_cast<cl::Kernel>(
+      CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts));
+
+  // Configure kernel window
+  auto win_config = validate_and_configure_window(input->info(), output->info());
+  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+  ICLKernel::configure_internal(win_config.second);
+}
+
+/** Resolve every lookup on the host, then enqueue the OpenCL kernel.
+ *
+ * The lookups, keys, hits and lookup-index tensors are mapped, each lookup value is searched in a
+ * key -> key-index map, and the result is written to the hits / lookup-index buffers (hit flag 1
+ * and the key index on success, hit flag 0 and index -1 otherwise). The tensors are then unmapped
+ * and the kernel is enqueued with the input, output and lookup-index tensors as arguments.
+ *
+ * @param[in]     window Execution window; must be a valid sub-window of the configured one.
+ * @param[in,out] queue  Command queue used for mapping and for enqueueing the kernel.
+ */
+void CLHashtableLookupKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+  const_cast<ICLTensor *>(_lookups)->map(queue);
+  const_cast<ICLTensor *>(_keys)->map(queue);
+  _hits->map(queue);
+  _lookup_indices->map(queue);
+
+  // Set values of hits
+  const int32_t *lookups_buf =
+      reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_lookups)->buffer());
+  const int32_t *keys_buf = reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_keys)->buffer());
+  uint8_t *hits_buf = reinterpret_cast<uint8_t *>(_hits->buffer());
+  int32_t *lookup_indices_buf = reinterpret_cast<int32_t *>(_lookup_indices->buffer());
+
+  // key value -> position of that key; a repeated key keeps the position of its last occurrence.
+  std::map<int32_t, size_t> key_map;
+  const size_t keys_num = _keys->info()->dimension(0);
+  for (size_t key_index = 0; key_index < keys_num; key_index++)
+  {
+    key_map[keys_buf[key_index]] = key_index;
+  }
+
+  const size_t lookups_num = _lookups->info()->dimension(0);
+  for (size_t i = 0; i < lookups_num; ++i)
+  {
+    const auto lookup_value = lookups_buf[i];
+    const auto it = key_map.find(lookup_value);
+    if (it != key_map.end())
+    {
+#if defined(DEBUG)
+      // The stored value is a position in the key table, so it is bounded by the number of
+      // keys, not by the number of lookups.
+      if (it->second >= keys_num)
+        ARM_COMPUTE_ERROR("HashTable Lookup: index out of bounds.");
+#endif // defined(DEBUG)
+      lookup_indices_buf[i] = static_cast<int32_t>(it->second);
+      hits_buf[i] = static_cast<uint8_t>(1);
+    }
+    else
+    {
+      // Miss: flag it and hand -1 to the kernel as the index.
+      lookup_indices_buf[i] = -1;
+      hits_buf[i] = static_cast<uint8_t>(0);
+    }
+  }
+
+  const_cast<ICLTensor *>(_lookups)->unmap(queue);
+  const_cast<ICLTensor *>(_keys)->unmap(queue);
+  _hits->unmap(queue);
+  _lookup_indices->unmap(queue);
+
+  Window win = window.collapse(ICLKernel::window(), 2, 4);
+
+  Window win_lookup;
+  win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+  do
+  {
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, win);
+    add_4D_tensor_argument(idx, _output, win);
+    add_1D_tensor_argument(idx, _lookup_indices.get(), win_lookup);
+
+    enqueue(queue, *this, win);
+  } while (window.slide_window_slice_4D(win) && window.slide_window_slice_1D(win_lookup));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
new file mode 100644
index 000000000..80d99dd3b
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLNegKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+/** Assert that @p input and @p output are usable by the negation kernel.
+ *
+ * Both tensors must be single-channel S16, S32, F16 or F32, and must agree in shape and data
+ * type. All checks are assertions, so they are only active in assert-enabled builds.
+ *
+ * @param[in] input  Info of the source tensor.
+ * @param[in] output Info of the destination tensor.
+ *
+ * @return Always an empty (OK) status.
+ */
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16, DataType::S32,
+                                                DataType::F16, DataType::F32);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16, DataType::S32,
+                                                DataType::F16, DataType::F32);
+  // The parameters already are tensor infos, so the shape is read from them directly.
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape());
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+  return Status{};
+}
+
+} // namespace
+
+CLNegKernel::CLNegKernel() : _input(nullptr), _output(nullptr) {} // Default constructor: both tensors stay null until configure() assigns them.
+
+/** Attach the tensors, build the "neg_tensor" OpenCL kernel and set the execution window.
+ *
+ * @param[in]  input  Source tensor.
+ * @param[out] output Destination tensor.
+ */
+void CLNegKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+
+  _input = input;
+  _output = output;
+
+  // Vector width handled per iteration along X; also handed to the kernel as VEC_SIZE.
+  constexpr unsigned int vec_size = 16;
+
+  ITensorInfo *const src_info = input->info();
+  ITensorInfo *const dst_info = output->info();
+
+  // Build the kernel for the input's data type.
+  const std::set<std::string> build_opts = {
+      "-DDATA_TYPE=" + get_cl_type_from_data_type(src_info->data_type()),
+      "-DVEC_SIZE=" + support::cpp11::to_string(vec_size)};
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("neg_tensor", build_opts));
+
+  // Window spans the whole input; both tensors are accessed vec_size elements at a time.
+  Window kernel_window = calculate_max_window(*src_info, Steps(vec_size));
+
+  AccessWindowHorizontal src_access(src_info, 0, vec_size);
+  AccessWindowHorizontal dst_access(dst_info, 0, vec_size);
+  update_window_and_padding(kernel_window, src_access, dst_access);
+  dst_access.set_valid_region(kernel_window, src_info->valid_region());
+
+  ICLKernel::configure_internal(kernel_window);
+}
+
+/** Enqueue the negation kernel once per 3D slice of @p window.
+ *
+ * @param[in]     window Execution window; must be a valid sub-window of the configured one.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLNegKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  // Collapse from DimZ upwards where possible, then walk the result slice by slice.
+  Window collapsed_window = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+  Window current_slice = collapsed_window.first_slice_window_3D();
+
+  bool more_slices = true;
+  while (more_slices)
+  {
+    unsigned int arg_idx = 0;
+    add_3D_tensor_argument(arg_idx, _input, current_slice);
+    add_3D_tensor_argument(arg_idx, _output, current_slice);
+    enqueue(queue, *this, current_slice, lws_hint());
+
+    more_slices = collapsed_window.slide_window_slice_3D(current_slice);
+  }
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp
new file mode 100644
index 000000000..12bbe910f
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+/** Check the input (and, once configured, the output) of the normalization kernel.
+ *
+ * @param[in] input     Source info; must be single-channel F16 or F32.
+ * @param[in] output    Destination info; must not be null. When it is already configured it must
+ *                      match @p input in data type and shape.
+ * @param[in] norm_info Normalization parameters (not inspected here).
+ *
+ * @return An error status describing the first failed check, an empty status otherwise.
+ */
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          NormalizationLayerInfo norm_info)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+
+  // Nothing to compare against while the output is still empty.
+  if (output->total_size() == 0)
+  {
+    return Status{};
+  }
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+  return Status{};
+}
+
+/** Auto-initialise @p output if empty, then compute the window and update the tensor paddings.
+ *
+ * @param[in,out] input     Source info.
+ * @param[in,out] output    Destination info; initialised from a clone of @p input when empty.
+ * @param[in]     norm_info Normalization parameters; select how many elements are read per step.
+ *
+ * @return The status (an "Insufficient Padding!" runtime error when the window had to change)
+ *         paired with the resulting window.
+ */
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output,
+                                                        NormalizationLayerInfo norm_info)
+{
+  // Output tensor auto initialization if not yet initialized
+  auto_init_if_empty(*output, *input->clone());
+
+  const unsigned int vec_size = 4;
+  const unsigned int half_norm = norm_info.norm_size() / 2;
+
+  // In-map normalization reads neighbours on both sides of each element; otherwise only the
+  // processed elements themselves are read.
+  unsigned int horizontal_border = 0;
+  unsigned int elems_read = vec_size;
+  if (norm_info.is_in_map())
+  {
+    horizontal_border = std::min(half_norm, 3U);
+    elems_read = vec_size + 2 * half_norm;
+  }
+  const BorderSize border = BorderSize(0, horizontal_border);
+
+  Window window = calculate_max_window(*input, Steps(vec_size));
+
+  // We do not use a Rectangle window for IN_MAP_2D as we clamp the top and bottom accesses inside
+  // the kernel, avoiding padding
+  AccessWindowHorizontal in_access(input, -border.left, elems_read);
+  AccessWindowHorizontal out_access(output, 0, vec_size);
+
+  const bool padding_insufficient = update_window_and_padding(window, in_access, out_access);
+
+  out_access.set_valid_region(window, input->valid_region());
+
+  Status status{};
+  if (padding_insufficient)
+  {
+    status = ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!");
+  }
+  return std::make_pair(status, window);
+}
+} // namespace
+
+CLNormalizationLayerExKernel::CLNormalizationLayerExKernel() // Default constructor: no tensors attached, zero border, cross-map assumed.
+ : _input(nullptr), _output(nullptr), _border_size(0), _is_in_map(false)
+{
+}
+
+BorderSize CLNormalizationLayerExKernel::border_size() const { return _border_size; } // Returns the stored border size.
+
+/** Attach the tensors, build the normalization OpenCL kernel and set the execution window.
+ *
+ * @param[in]  input     Source tensor.
+ * @param[out] output    Destination tensor; auto-initialised from @p input when empty.
+ * @param[in]  norm_info Normalization parameters (type, size, coefficients).
+ */
+void CLNormalizationLayerExKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                             NormalizationLayerInfo norm_info)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+  // Output tensor auto initialization if not yet initialized
+  auto_init_if_empty(*output->info(), *input->info()->clone());
+
+  // Perform validation step
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), norm_info));
+
+  _input = input;
+  _output = output;
+
+  // Record the normalization flavour and its border. Kernel selection below, run() and
+  // border_size() all read these members, and the window helper derives the same border
+  // from norm_info, so they have to agree with it.
+  _is_in_map = norm_info.is_in_map();
+  const unsigned int border_width = _is_in_map ? std::min(norm_info.norm_size() / 2, 3U) : 0;
+  _border_size = BorderSize(0, border_width);
+
+  const unsigned int num_elems_processed_per_iteration = 4;
+  const bool is_in_map_2D = (norm_info.type() == NormType::IN_MAP_2D);
+
+  // Set build options
+  CLBuildOptions build_opts;
+  build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+  build_opts.add_option(
+      ("-DCOEFF=" + float_to_string_with_full_precision(norm_info.scale_coeff())));
+  build_opts.add_option(("-DBETA=" + float_to_string_with_full_precision(norm_info.beta())));
+  build_opts.add_option(("-DKAPPA=" + float_to_string_with_full_precision(norm_info.kappa())));
+  build_opts.add_option(
+      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+  build_opts.add_option(("-DRADIUS=" + support::cpp11::to_string(norm_info.norm_size())));
+  build_opts.add_option(("-DNUM_SLICES=" + support::cpp11::to_string(input->info()->dimension(2))));
+  build_opts.add_option_if(is_in_map_2D, "-DIN_MAP_2D");
+
+  // Create kernel
+  std::string kernel_name =
+      _is_in_map ? "normalization_layer_in_map" : "normalization_layer_cross_map";
+  _kernel = static_cast<cl::Kernel>(
+      CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+  // Configure kernel window
+  auto win_config = validate_and_configure_window(input->info(), output->info(), norm_info);
+  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+  ICLKernel::configure_internal(win_config.second);
+
+  // Set config_id for enabling LWS tuning
+  _config_id = "normalization_layer_";
+  _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+  _config_id += "_";
+  _config_id += support::cpp11::to_string(
+      static_cast<std::underlying_type<NormType>::type>(norm_info.type()));
+  _config_id += "_";
+  _config_id += support::cpp11::to_string(norm_info.norm_size());
+  _config_id += "_";
+  _config_id += support::cpp11::to_string(input->info()->dimension(0));
+  _config_id += "_";
+  _config_id += support::cpp11::to_string(input->info()->dimension(1));
+}
+
+/** Check whether the kernel could be configured with the given tensor infos.
+ *
+ * @param[in] input     Source info.
+ * @param[in] output    Destination info.
+ * @param[in] norm_info Normalization parameters.
+ *
+ * @return The first error reported by the argument or window validation, else an empty status.
+ */
+Status CLNormalizationLayerExKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                              NormalizationLayerInfo norm_info)
+{
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, norm_info));
+
+  // The window helper takes mutable infos, so it is given clones rather than the caller's
+  // const objects.
+  const auto input_clone = input->clone();
+  const auto output_clone = output->clone();
+  const Status window_status =
+      validate_and_configure_window(input_clone.get(), output_clone.get(), norm_info).first;
+  ARM_COMPUTE_RETURN_ON_ERROR(window_status);
+
+  return Status{};
+}
+
+/** Enqueue the normalization kernel once per 3D slice of @p window.
+ *
+ * @param[in]     window Execution window; must be a valid sub-window of the configured one.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLNormalizationLayerExKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+  // In-map kernels collapse from DimZ, all others from dimension 4.
+  int first_collapsed_dim = 4;
+  if (_is_in_map)
+  {
+    first_collapsed_dim = Window::DimZ;
+  }
+
+  Window collapsed_window = window.collapse_if_possible(ICLKernel::window(), first_collapsed_dim);
+  Window current_slice = collapsed_window.first_slice_window_3D();
+
+  bool more_slices = true;
+  while (more_slices)
+  {
+    unsigned int arg_idx = 0;
+    add_3D_tensor_argument(arg_idx, _input, current_slice);
+    add_3D_tensor_argument(arg_idx, _output, current_slice);
+    enqueue(queue, *this, current_slice);
+
+    more_slices = collapsed_window.slide_window_slice_3D(current_slice);
+  }
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp
new file mode 100644
index 000000000..241f8ae4d
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLPReLUKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+// Elements handled per iteration along X; also handed to the kernel as VEC_SIZE.
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+/** Check the tensor infos of a PReLU operation.
+ *
+ * @param[in] input  Source info; single-channel F16, F32 or QASYMM8.
+ * @param[in] alpha  Slope info; single-channel F16, F32 or QASYMM8, broadcast compatible with
+ *                   @p input.
+ * @param[in] output Destination info. When already configured it must use one of the same data
+ *                   types and have the broadcast shape of @p input and @p alpha.
+ *
+ * @return An error status describing the first failed check, an empty status otherwise.
+ */
+Status validate_info(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output)
+{
+  const TensorShape &out_shape =
+      TensorShape::broadcast_shape(input->tensor_shape(), alpha->tensor_shape());
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32,
+                                                       DataType::QASYMM8);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(alpha, 1, DataType::F16, DataType::F32,
+                                                       DataType::QASYMM8);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
+                                  "Inputs are not broadcast compatible");
+  // Validate in case of configured output
+  if (output->total_size() > 0)
+  {
+    // QASYMM8 has to be accepted here as well: configure() requires the output to share the
+    // input's data type and builds a dedicated "_qasymm8" kernel for quantized inputs.
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32,
+                                                         DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
+        "Wrong shape for output");
+  }
+  return Status{};
+}
+} // namespace
+
+// Default constructor: all tensors stay null until configure() assigns them.
+CLPReLUKernel::CLPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {}
+
+/** Attach the tensors, build the PReLU OpenCL kernel and set the execution window.
+ *
+ * @param[in]  input  Source tensor.
+ * @param[in]  alpha  Slope tensor, broadcast against @p input.
+ * @param[out] output Destination tensor; its shape (and, for float inputs, its format) is set
+ *                    here when it has not been initialised yet.
+ */
+void CLPReLUKernel::configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output)
+{
+  // Guard the dereferences below.
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, alpha, output);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, alpha);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+  // Use the file-local validator defined above.
+  ARM_COMPUTE_ERROR_THROW_ON(validate_info(input->info(), alpha->info(), output->info()));
+
+  _input = input;
+  _alpha = alpha;
+  _output = output;
+
+  // Create kernel
+  std::string kernel_name = "prelu";
+  std::set<std::string> build_opts;
+  build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+  build_opts.emplace(
+      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+
+  // Quantized inputs use a separate kernel that needs the offset/scale of all three tensors.
+  if (is_data_type_quantized_asymmetric(input->info()->data_type()))
+  {
+    build_opts.emplace("-DOFF_IN1=" +
+                       support::cpp11::to_string(input->info()->quantization_info().offset));
+    build_opts.emplace("-DOFF_IN2=" +
+                       support::cpp11::to_string(alpha->info()->quantization_info().offset));
+    build_opts.emplace("-DOFF_OUT=" +
+                       support::cpp11::to_string(output->info()->quantization_info().offset));
+    build_opts.emplace("-DSCALE_IN1=" +
+                       support::cpp11::to_string(input->info()->quantization_info().scale));
+    build_opts.emplace("-DSCALE_IN2=" +
+                       support::cpp11::to_string(alpha->info()->quantization_info().scale));
+    build_opts.emplace("-DSCALE_OUT=" +
+                       support::cpp11::to_string(output->info()->quantization_info().scale));
+    kernel_name += "_qasymm8";
+  }
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+  const std::pair<TensorShape, ValidRegion> broadcast_pair =
+      ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info());
+
+  const TensorShape &out_shape = broadcast_pair.first;
+  const ValidRegion &valid_region = broadcast_pair.second;
+
+  // Auto initialize output if not initialized
+  {
+    set_shape_if_empty(*output->info(), out_shape);
+
+    if (input->info()->data_type() == DataType::F16 && alpha->info()->data_type() == DataType::F16)
+    {
+      set_format_if_unknown(*output->info(), Format::F16);
+    }
+    else if (input->info()->data_type() == DataType::F32 ||
+             alpha->info()->data_type() == DataType::F32)
+    {
+      set_format_if_unknown(*output->info(), Format::F32);
+    }
+  }
+
+  Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+  Window win_input1 = win.broadcast_if_dimension_le_one(*input->info());
+  Window win_input2 = win.broadcast_if_dimension_le_one(*alpha->info());
+
+  AccessWindowHorizontal input1_access(input->info(), 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal input2_access(alpha->info(), 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+  bool window_changed = update_window_and_padding(win_input1, input1_access) ||
+                        update_window_and_padding(win_input2, input2_access) ||
+                        update_window_and_padding(win, output_access);
+  // The result is not acted upon here; mark it used to keep the build warning-free.
+  ARM_COMPUTE_UNUSED(window_changed);
+
+  output_access.set_valid_region(win, valid_region);
+
+  ICLKernel::configure_internal(win);
+}
+
+void CLPReLUKernel::run(const Window &window, cl::CommandQueue &queue)
+{ // Enqueues the kernel once per 3D slice of the (possibly collapsed) window, with separately broadcast slices for input and alpha.
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const TensorShape &in_shape1 = _input->info()->tensor_shape();
+ const TensorShape &in_shape2 = _alpha->info()->tensor_shape();
+ const TensorShape &out_shape = _output->info()->tensor_shape();
+
+ bool can_collapse = true; // stays true when either operand holds at most one element
+ if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+ {
+ can_collapse =
+ (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); // both operands must have more than DimZ dimensions
+ for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+ {
+ can_collapse = (in_shape1[d] == in_shape2[d]); // and agree on every dimension from DimZ upwards
+ }
+ }
+
+ bool has_collapsed = false; // written by collapse_if_possible through the pointer passed below
+ Window collapsed =
+ can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
+ : window;
+
+ const TensorShape &in_shape1_collapsed =
+ has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; // operand shapes follow the window only when it really was collapsed
+ const TensorShape &in_shape2_collapsed =
+ has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+ Window slice = collapsed.first_slice_window_3D();
+ Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); // per-operand slices derived from the output slice and each operand's shape
+ Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice_input1);
+ add_3D_tensor_argument(idx, _alpha, slice_input2);
+ add_3D_tensor_argument(idx, _output, slice);
+
+ enqueue(queue, *this, slice);
+
+ collapsed.slide_window_slice_3D(slice_input1); // advance the operand slices alongside the output slice
+ collapsed.slide_window_slice_3D(slice_input2);
+ } while (collapsed.slide_window_slice_3D(slice));
+}
+
+/** Border required by the kernel.
+ *
+ * @return A border whose second component is the difference between the output's dimension 0 and
+ *         the smaller dimension 0 of the two operands, capped at one less than the number of
+ *         elements processed per iteration; the other three components are 0.
+ */
+BorderSize CLPReLUKernel::border_size() const
+{
+  const size_t out_width = _output->info()->dimension(0);
+  const size_t narrowest_operand =
+      std::min(_input->info()->dimension(0), _alpha->info()->dimension(0));
+
+  const unsigned int replicated = out_width - narrowest_operand;
+  const unsigned int largest_border = num_elems_processed_per_iteration - 1U;
+
+  return BorderSize(0, std::min<unsigned int>(largest_border, replicated), 0, 0);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp
new file mode 100644
index 000000000..99b54c822
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLPadLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+/** Validate the tensor infos handed to the pad kernel.
+ *
+ * @param[in] input_info    Source info; 1-D to 4-D.
+ * @param[in] output_info   Destination info; same number of dimensions as @p input_info and,
+ *                          for QASYMM8, the same quantization info.
+ * @param[in] pad_size_info Info of the S32 tensor holding the padding amounts.
+ *
+ * @return An error status describing the first failed check, an empty status otherwise. The data
+ *         type checks are assertions and are only active in assert-enabled builds.
+ */
+Status validate_arguments(const ITensorInfo *input_info, const ITensorInfo *output_info,
+                          const ITensorInfo *pad_size_info)
+{
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_info, 1, DataType::U8, DataType::QASYMM8,
+                                                DataType::S16, DataType::S32, DataType::F16,
+                                                DataType::F32);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_info, 1, DataType::U8, DataType::QASYMM8,
+                                                DataType::S16, DataType::S32, DataType::F16,
+                                                DataType::F32);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(pad_size_info, 1, DataType::S32);
+
+  // ARM_COMPUTE_RETURN_ERROR_ON_MSG reports its message when the condition is TRUE, so each
+  // condition below has to describe the invalid case.
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_info->num_dimensions() == 0 ||
+                                      input_info->num_dimensions() > 4,
+                                  "Pad kernel supports upto 4-D input tensor");
+
+  // NOTE(review): num_dimensions() may not count trailing dimensions of size 1 - confirm this
+  // comparison is acceptable when padding grows such a dimension.
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      input_info->num_dimensions() != output_info->num_dimensions(),
+      "output tensor should have same number of dimensions as input tensor");
+
+  if (input_info->data_type() == DataType::QASYMM8)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_info->quantization_info() !=
+                                        output_info->quantization_info(),
+                                    "The input and output quantization info are different!");
+  }
+
+  return Status{};
+}
+
+} // namespace
+
+CLPadLayerKernel::CLPadLayerKernel() : _input(nullptr), _output(nullptr), _pad_size(nullptr) {} // Default constructor: all tensors stay null until configure() assigns them.
+
+/** Attach the tensors, build the "pad" OpenCL kernel and set the execution window.
+ *
+ * @param[in]  input    Source tensor.
+ * @param[out] output   Destination tensor; the window covers it completely.
+ * @param[in]  pad_size Tensor holding the padding amounts, read in run().
+ */
+void CLPadLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *pad_size)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, pad_size);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pad_size->info()));
+
+  _input = input;
+  _output = output;
+  _pad_size = pad_size;
+
+  ITensorInfo *const src_info = input->info();
+  ITensorInfo *const dst_info = output->info();
+
+  // Fill value handed to the kernel: the quantization offset for QASYMM8, plain 0 otherwise.
+  std::string zero_value = support::cpp11::to_string(0);
+  if (src_info->data_type() == DataType::QASYMM8)
+  {
+    zero_value = support::cpp11::to_string(src_info->quantization_info().offset);
+  }
+
+  // Set kernel build options
+  std::set<std::string> build_opts;
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(src_info->data_type()));
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(dst_info->dimension(2)));
+  build_opts.emplace("-DIW=" + support::cpp11::to_string(src_info->dimension(0)));
+  build_opts.emplace("-DIH=" + support::cpp11::to_string(src_info->dimension(1)));
+  build_opts.emplace("-DID=" + support::cpp11::to_string(src_info->dimension(2)));
+  build_opts.emplace("-DIB=" + support::cpp11::to_string(src_info->dimension(3)));
+  build_opts.emplace("-DZERO_VALUE=" + zero_value);
+
+  // Create kernel
+  _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("pad", build_opts));
+
+  // Configure kernel window: every output element, default steps.
+  Window kernel_window = calculate_max_window(*dst_info, Steps());
+
+  // The whole output is valid, anchored at coordinates of the output's rank.
+  Coordinates anchor;
+  anchor.set_num_dimensions(dst_info->num_dimensions());
+  dst_info->set_valid_region(ValidRegion(anchor, dst_info->tensor_shape()));
+
+  ICLKernel::configure_internal(kernel_window);
+}
+
+void CLPadLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{ // Reads the leading padding amounts from _pad_size on the host, then enqueues the pad kernel with them as a cl_int4 argument.
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ _pad_size->map(queue); // make the pad-size buffer readable from the host
+
+ // Padding values only for up, top, left and front are required based on the rank of tensor
+ int rank = _pad_size->info()->dimension(1); // rank is taken from dimension 1 of the pad-size tensor
+
+ auto pad_batch_up =
+ (rank == 4) ? *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, 0})) : 0; // only present for rank 4
+ auto pad_height_top =
+ (rank >= 2)
+ ? *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, (rank == 2) ? 0 : 1}))
+ : 0;
+ auto pad_width_left = (rank >= 1)
+ ? *reinterpret_cast<const int32_t *>(
+ _pad_size->ptr_to_element({0, (rank == 4) ? 2 : rank - 1}))
+ : 0;
+ auto pad_depth_front =
+ (rank >= 3)
+ ? *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, (rank == 3) ? 0 : 3}))
+ : 0;
+
+ _pad_size->unmap(queue);
+
+ // Pad_values which needs to be passed (order: width, height, depth, batch)
+ const cl_int4 paddingValues = {
+ {static_cast<cl_int>(pad_width_left), static_cast<cl_int>(pad_height_top),
+ static_cast<cl_int>(pad_depth_front), static_cast<cl_int>(pad_batch_up)}};
+
+ Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+ // Setup input slice: a copy of the output slice with the first four dimensions reset to (0, 0, 0)
+ Window slice_in(slice_out);
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ slice_in.set(3, Window::Dimension(0, 0, 0));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _output, slice_out);
+ _kernel.setArg<cl_int4>(idx++, paddingValues); // padding amounts follow the two tensor arguments
+ enqueue(queue, *this, slice_out);
+ } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp
new file mode 100644
index 000000000..aa094761c
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLPermuteExKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+using namespace arm_compute;
+
+namespace
+{
+// Returns a copy of the input's tensor shape after permute() has been applied to it with `perm`.
+TensorShape get_output_shape(const ITensorInfo *input, const PermutationVector &perm)
+{
+  TensorShape permuted_shape(input->tensor_shape());
+  permute(permuted_shape, perm);
+  return permuted_shape;
+}
+
+// Checks that `input` has one of the supported single-channel data types and, when `output` is
+// already initialised, that its shape equals the permuted input shape and its data type matches.
+//
+// Returns an error Status for the first failed check, or an empty Status when everything is valid.
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          const PermutationVector &perm)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+
+  const TensorShape expected_shape =
+      misc::shape_calculator::compute_permutation_output_shape(*input, perm);
+
+  // An output that has not been initialised yet has nothing to be compared against
+  if (output->total_size() == 0)
+  {
+    return Status{};
+  }
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), expected_shape);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+  return Status{};
+}
+} // namespace
+
+// Creates an unconfigured kernel: no tensors attached and a default-constructed permutation.
+CLPermuteExKernel::CLPermuteExKernel()
+    : _input(nullptr), _output(nullptr), _perm()
+{
+}
+
+// Binds the tensors, builds the "permute_generic" OpenCL kernel and sets the execution window.
+//
+// input  - source tensor; must not be null.
+// output - destination tensor; must not be null. If it is still empty it is initialised from the
+//          input's info with the permuted shape.
+// perm   - permutation vector; its first four entries are passed to the kernel as P1..P4.
+void CLPermuteExKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                  const PermutationVector &perm)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), perm));
+
+  _input = input;
+  _output = output;
+  _perm = perm;
+
+  // Output auto initialization if not yet initialized
+  const TensorShape permuted_shape = get_output_shape(input->info(), perm);
+  auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(permuted_shape));
+
+  // Collect the compile-time definitions for the kernel
+  std::set<std::string> build_opts;
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
+
+  // P1..P4 carry the first four entries of the permutation vector, in order
+  for (unsigned int i = 0; i < 4; ++i)
+  {
+    build_opts.emplace("-DP" + support::cpp11::to_string(i + 1) + "=" +
+                       support::cpp11::to_string(perm[i]));
+  }
+
+  _kernel = static_cast<cl::Kernel>(
+      CLKernelLibraryEx::get().create_kernel("permute_generic", build_opts));
+
+  // The execution window spans the whole input tensor
+  Window win = calculate_max_window(*input->info(), Steps());
+
+  // No padding is needed, so update_window_and_padding() is skipped and the valid region of the
+  // output is simply its full shape
+  Coordinates origin;
+  origin.set_num_dimensions(output->info()->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(origin, output->info()->tensor_shape()));
+
+  ICLKernel::configure_internal(win);
+}
+
+// Static check of a permute configuration without creating a kernel.
+//
+// Returns an error Status if either tensor info is null or if validate_arguments() rejects the
+// combination of input, output and perm; otherwise returns a non-error Status.
+Status CLPermuteExKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                   const PermutationVector &perm)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+  return validate_arguments(input, output, perm);
+}
+
+// Enqueues the permute kernel on `queue`, one collapsed 4D slice of `window` at a time.
+//
+// window - execution window; must match the window the kernel was configured with.
+// queue  - OpenCL command queue the kernel is enqueued on.
+void CLPermuteExKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+  Window in_slice = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  // The output slice starts as a copy of the input slice with its first four dimensions reset
+  // to (0, 0, 0)
+  // NOTE(review): presumably the OpenCL kernel derives the output coordinates itself from
+  // P1..P4 - confirm against the permute_generic source
+  Window out_slice(in_slice);
+  const Window::Dimension unset_dim(0, 0, 0);
+  out_slice.set(Window::DimX, unset_dim);
+  out_slice.set(Window::DimY, unset_dim);
+  out_slice.set(Window::DimZ, unset_dim);
+  out_slice.set(3, unset_dim);
+
+  bool has_next = true;
+  while (has_next)
+  {
+    unsigned int arg_idx = 0;
+    add_4D_tensor_argument(arg_idx, _input, in_slice);
+    add_4D_tensor_argument(arg_idx, _output, out_slice);
+    enqueue(queue, *this, in_slice);
+
+    // Advance both slices; stop as soon as either one cannot be advanced any further
+    has_next = window.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice);
+  }
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp
index a3e0163de..b985aa737 100644
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp
@@ -17,20 +17,8 @@
#include "arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <cmath>
-#include <cstdlib>
-#include <set>
-#include <string>
using namespace arm_compute;
@@ -45,12 +33,10 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
ARM_COMPUTE_UNUSED(overflow_policy);
ARM_COMPUTE_UNUSED(rounding_policy);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8,
- DataType::QS16, DataType::S16, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8,
- DataType::QS16, DataType::S16, DataType::F16,
- DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative.");
const TensorShape &out_shape =
@@ -58,21 +44,11 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
"Inputs are not broadcast compatible");
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2);
-
- if (is_data_type_fixed_point(input1->data_type()))
- {
- // All data types must be all QS8 or all QS16
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale != 1,
- "Unsupported scaling factor for QS8/QS16. Scale must be 1.");
- }
// Validate in case of configured output
if (output->total_size() > 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8,
- DataType::QS16, DataType::S16,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16,
DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(
output->data_type() == DataType::U8 &&
@@ -81,11 +57,6 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
ARM_COMPUTE_RETURN_ERROR_ON_MSG(
detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
"Wrong shape for output");
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, output);
- if (is_data_type_fixed_point(input1->data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
- }
}
return Status{};
@@ -191,14 +162,6 @@ void CLPixelWiseDivisionKernel::configure(const ICLTensor *input1, const ICLTens
{
compute_type = "int";
}
- else if (input1->info()->data_type() == DataType::QS8)
- {
- compute_type = "qs8";
- }
- else if (input1->info()->data_type() == DataType::QS16)
- {
- compute_type = "qs16";
- }
else
{
compute_type = "ushort";
@@ -218,11 +181,6 @@ void CLPixelWiseDivisionKernel::configure(const ICLTensor *input1, const ICLTens
: "-DSATURATE");
build_opts.emplace((rounding_policy == RoundingPolicy::TO_ZERO) ? "-DROUND=_rtz"
: "-DROUND=_rte");
- if (is_data_type_fixed_point(input1->info()->data_type()))
- {
- build_opts.emplace("-DFIXED_POINT_POSITION=" +
- support::cpp11::to_string(input1->info()->fixed_point_position()));
- }
build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
@@ -245,7 +203,7 @@ void CLPixelWiseDivisionKernel::configure(const ICLTensor *input1, const ICLTens
_kernel.setArg(idx++, scale);
}
- ICLKernel::configure(win_config.second);
+ ICLKernel::configure_internal(win_config.second);
}
Status CLPixelWiseDivisionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2,
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp
deleted file mode 100644
index 168b246bf..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLReduceMaxKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <cmath>
-#include <cstdlib>
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate_arguments(const ITensorInfo *input, int32_t axis, const ITensorInfo *output)
-{
- // We can handle for simple case only
- // Input rank: 2
- // Output rank: 1
- // Axis: one axis value, restrict to 1
-
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis != 1, "Axis only allowed 1");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
- "Inputs are not broadcast compatible");
-
- // Validate in case of configured output
- if (output->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != input->data_type(),
- "Output same type allowed for input and output");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().num_dimensions() != 1,
- "Only support for output dimension 1");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->tensor_shape().num_dimensions() != 2,
- "Only support for input dimension 2");
- }
-
- return Status{};
-}
-
-} // namespace
-
-CLReduceMaxKernel::CLReduceMaxKernel() : _input(nullptr), _output(nullptr), _axis(0) {}
-
-void CLReduceMaxKernel::configure(const ICLTensor *input, int32_t axis, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, output->info()));
-
- _input = input;
- _output = output;
- _axis = axis;
-
- // Configure kernel window
- int cols = _input->info()->tensor_shape()[0];
- int rows = _input->info()->tensor_shape()[1];
- Window win;
- win.set(0, Window::Dimension(0, cols, 1));
- win.set(1, Window::Dimension(0, rows, 1));
-
- // Construct kernel name
- std::string kernel_name = "reduce_max";
-
- // Set kernel build options
- std::set<std::string> build_opts;
- build_opts.emplace("-DWIDTH=" + support::cpp11::to_string(cols));
-
- // Create kernel
- _kernel =
- static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
- ICLKernel::configure(win);
-}
-
-Status CLReduceMaxKernel::validate(const ITensorInfo *input, int32_t axis,
- const ITensorInfo *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, output));
-
- return Status{};
-}
-
-void CLReduceMaxKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window window_input = window;
- Window slice_input = window_input.first_slice_window_1D();
-
- do
- {
- Window slice_output = slice_input.shift_dimensions(1);
- unsigned int idx = 0;
- add_1D_tensor_argument(idx, _input, slice_input);
- add_1D_tensor_argument(idx, _output, slice_output);
- enqueue(queue, *this, slice_input);
-
- } while (window_input.slide_window_slice_1D(slice_input));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
new file mode 100644
index 000000000..f581780e1
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+namespace
+{
+// Returns a copy of `input_shape` in which dimension `axis` has been set to 1.
+// NOTE This is necessary because it is not guaranteed that the axis positions of input and output
+//      are the same.
+const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis)
+{
+  TensorShape reduced_shape(input_shape);
+  reduced_shape.set(axis, 1);
+  return reduced_shape;
+}
+} // namespace
+
+namespace
+{
+// Validates the tensors and parameters of a reduce operation.
+//
+// input  - tensor to reduce; QASYMM8, F16, F32 or S32 (QASYMM8 is rejected for SUM and MEAN).
+// output - result tensor; must already be initialised, share the input's data type and hold as
+//          many elements as the input shape with dimension `axis` set to 1.
+// axis   - dimension to reduce; must be smaller than the input's number of dimensions.
+// op     - reduce operation to perform.
+//
+// Returns an error Status for the first failed check, or an empty Status when everything is valid.
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis,
+                          ReduceOperation op)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+  if (output->total_size() != 0)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+  }
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+                                                       DataType::F32, DataType::S32);
+  if (op == ReduceOperation::MEAN || op == ReduceOperation::SUM)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8,
+                                    "Not support QASYMM8, yet");
+  }
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
+                                  "Inputs are not broadcast compatible");
+
+  // The output shape is known to be non-empty here, so the data types can be compared
+  // unconditionally. The RETURN variant is used so that a mismatch is reported through the
+  // returned Status like every other check in this function.
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+  // The macro reports an error when its condition is TRUE, so the condition has to describe the
+  // invalid case. `axis` is unsigned, hence only the upper bound needs to be checked.
+  const auto num_dimensions = input->tensor_shape().num_dimensions();
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      axis >= num_dimensions,
+      "axis must be greater than or equal to 0 and less than (input's rank).");
+
+  const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(),
+                                  "output shape's size does not match axis");
+
+  return Status{};
+}
+} // namespace
+
+// Creates an unconfigured kernel: no tensors attached and a value-initialised reduction axis.
+CLReduceOperationKernel::CLReduceOperationKernel()
+    : _input(nullptr), _output(nullptr), _axis()
+{
+}
+
+// Binds the tensors, builds the OpenCL reduce kernel for `op` and sets the execution window.
+//
+// input  - tensor to reduce; must not be null.
+// output - result tensor; must not be null.
+// axis   - dimension of `input` to reduce.
+// op     - MAX and MIN use the "reduce_min_max" kernel, SUM and MEAN use "reduce_sum_mean".
+//
+// Throws std::runtime_error for any other operation.
+void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                        const uint32_t axis, ReduceOperation op)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
+
+  _input = input;
+  _output = output;
+  _axis = axis;
+
+  // Work on a copy of the output's info whose shape is the input shape with `axis` set to 1;
+  // the shape stored in the output tensor itself is not modified here
+  std::unique_ptr<ITensorInfo> reduced_info = output->info()->clone();
+  reduced_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis));
+
+  // Reject unsupported operations before anything is built
+  const bool is_min_max = (op == ReduceOperation::MAX) || (op == ReduceOperation::MIN);
+  const bool is_sum_mean = (op == ReduceOperation::SUM) || (op == ReduceOperation::MEAN);
+  if (!is_min_max && !is_sum_mean)
+  {
+    throw std::runtime_error("Operation not supported, yet");
+  }
+
+  // Each pair of operations shares one kernel; OP_CODE selects the operation inside it
+  const std::string kernel_name = is_min_max ? "reduce_min_max" : "reduce_sum_mean";
+  int op_code = 4; // MEAN
+  if (op == ReduceOperation::MAX)
+  {
+    op_code = 1;
+  }
+  else if (op == ReduceOperation::MIN)
+  {
+    op_code = 2;
+  }
+  else if (op == ReduceOperation::SUM)
+  {
+    op_code = 3;
+  }
+
+  // Set kernel build options
+  std::set<std::string> build_opts;
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(reduced_info->data_type()));
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(reduced_info->dimension(2)));
+  build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code));
+
+  // Create kernel
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+  // The execution window spans the reduced (output) shape
+  Window win = calculate_max_window(*reduced_info, Steps());
+
+  Coordinates origin;
+  origin.set_num_dimensions(reduced_info->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(origin, reduced_info->tensor_shape()));
+
+  ICLKernel::configure_internal(win);
+}
+
+// Static check of a reduce configuration without creating a kernel.
+//
+// input  - info of the tensor to reduce.
+// output - info of the result tensor.
+// axis   - dimension of `input` to reduce.
+// op     - reduce operation to perform.
+//
+// Returns an error Status if either tensor info is null or if validate_arguments() rejects the
+// configuration; otherwise returns an empty Status.
+Status CLReduceOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                         const uint32_t axis, ReduceOperation op)
+{
+  // Use the RETURN variant: this function reports problems through its Status result, so a null
+  // argument must produce an error Status rather than trigger the assert-style macro
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
+
+  return Status{};
+}
+
+// Enqueues the reduce kernel once on `queue` for the collapsed execution window.
+//
+// window - execution window; must be a valid sub-window of the configured window.
+// queue  - OpenCL command queue the kernel is enqueued on.
+//
+// The output tensor's shape is temporarily replaced while the kernel arguments are set and the
+// kernel is enqueued, and is put back before returning.
+void CLReduceOperationKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  const TensorShape &input_shape = _input->info()->tensor_shape();
+
+  // The input and output tensors occupy the leading argument slots; the two scalar arguments
+  // (reduction axis and the input's extent along it) come right after them
+  unsigned int arg_idx = 2 * num_arguments_per_4D_tensor();
+  _kernel.setArg<cl_int>(arg_idx++, _axis);
+  _kernel.setArg<cl_int>(arg_idx++, input_shape[_axis]);
+
+  // Support dimensions up to 4
+  Window out_slice = window.collapse(ICLKernel::window(), 2, 4);
+
+  // The input slice starts as a copy of the output slice with its first four dimensions reset
+  // to (0, 0, 0)
+  Window in_slice(out_slice);
+  const Window::Dimension unset_dim(0, 0, 0);
+  in_slice.set(Window::DimX, unset_dim);
+  in_slice.set(Window::DimY, unset_dim);
+  in_slice.set(Window::DimZ, unset_dim);
+  in_slice.set(3, unset_dim);
+
+  // Remember the output's current shape so it can be restored at the end of this method
+  // TODO Remove changing and recovering output's shape if it is guaranteed that the axis positions
+  //      of input and output are the same
+  const TensorShape original_out_shape = _output->info()->tensor_shape();
+  _output->info()->set_tensor_shape(inferOutputShape(input_shape, _axis));
+
+  arg_idx = 0;
+  add_4D_tensor_argument(arg_idx, _input, in_slice);
+  add_4D_tensor_argument(arg_idx, _output, out_slice);
+  enqueue(queue, *this, out_slice);
+
+  // Put the original shape back on the output tensor
+  _output->info()->set_tensor_shape(original_out_shape);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReductionMeanKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReductionMeanKernel.cpp
deleted file mode 100644
index 84a77122d..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLReductionMeanKernel.cpp
+++ /dev/null
@@ -1,198 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLReductionMeanKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/FixedPoint.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
- std::vector<uint32_t> axis)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis.size() >= TensorShape::num_max_dimensions,
- "Reduction axis greater than max number of dimensions");
-
- std::vector<uint32_t>::const_iterator it;
- bool axis_w = false;
- bool axis_h = false;
- for (it = axis.begin(); it != axis.end(); ++it)
- {
- if ((*it) == 0)
- {
- axis_w = true;
- }
- else if ((*it) == 1)
- {
- axis_h = true;
- }
- else
- {
- ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported axis!");
- }
- }
- // TODO Other axises (currently, only axises for both width and height are supported.)
- if (!axis_w || !axis_h)
- {
- ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported axis!");
- }
-
- if (output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != DataLayout::NCHW);
- }
-
- return Status{};
-}
-
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output,
- std::vector<uint32_t> axis)
-{
- // Output tensor auto initialization if not yet initialized
- TensorShape output_shape{input->tensor_shape()};
- output_shape.set(0, 1);
- output_shape.set(1, 1);
- auto_init_if_empty(*output, output_shape, output->num_channels(), input->data_type(),
- input->fixed_point_position());
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration_x = 8; // step
- const unsigned int num_elems_processed_per_iteration_y = input->dimension(1);
-
- Window win = calculate_max_window(
- *input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x,
- num_elems_processed_per_iteration_y);
- AccessWindowHorizontal output_access(output, 0, 1);
- bool window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, output->valid_region());
-
- Status err = (window_changed)
- ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
- : Status{};
-
- return std::make_tuple(err, win);
-}
-} // namespace
-
-CLReductionMeanKernel::CLReductionMeanKernel()
- : _input(nullptr), _output(nullptr), _reduction_axis(), _border_size()
-{
-}
-
-BorderSize CLReductionMeanKernel::border_size() const { return _border_size; }
-
-void CLReductionMeanKernel::configure(const ICLTensor *input, ICLTensor *output,
- std::vector<uint32_t> axis)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis));
-
- _input = input;
- _output = output;
- _reduction_axis = axis;
-
- constexpr unsigned int num_elems_processed_per_iteration_x = 8; // step
-
- // Set border size
- _border_size = BorderSize(
- ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration_x) -
- input->info()->dimension(0));
-
- // Set build options
- std::set<std::string> build_opts;
- build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
- // build_opts.emplace(("-DVEC_SIZE=" +
- // support::cpp11::to_string(num_elems_processed_per_iteration)));
- if (is_data_type_fixed_point(input->info()->data_type()))
- {
- build_opts.emplace("-DFIXED_POINT_POSITION=" +
- support::cpp11::to_string(input->info()->fixed_point_position()));
- }
-
- // Create kernel
- _kernel =
- static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("reduction_mean", build_opts));
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis);
-
- ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-
- ICLKernel::configure(std::get<1>(win_config));
-}
-
-Status CLReductionMeanKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
- std::vector<uint32_t> axis)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis));
- ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(
- validate_and_configure_window(input->clone().get(), output->clone().get(), axis)));
-
- return Status{};
-}
-
-void CLReductionMeanKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- // Set out window
- Window out_window(window);
- out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
-
- // Get first input and output slices
- Window in_slice = window.first_slice_window_2D();
- Window out_slice = out_window.first_slice_window_2D();
-
- // Set local sums buffer
- // TODO work_group
- unsigned int local_sum_size = _lws_hint[0] * _input->info()->element_size();
-
- unsigned int idx = 2 * num_arguments_per_2D_tensor();
- _kernel.setArg(idx++, local_sum_size, nullptr);
- _kernel.setArg<cl_int>(idx++, static_cast<cl_int>(_input->info()->dimension(1))); // height
- _kernel.setArg<cl_int>(idx++, static_cast<cl_int>(_input->info()->dimension(0) *
- _input->info()->dimension(1))); // divider
-
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, in_slice);
- in_slice.set_dimension_step(Window::DimY, _input->info()->dimension(1));
- add_2D_tensor_argument(idx, _output, out_slice);
- enqueue(queue, *this, in_slice);
- } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp
new file mode 100644
index 000000000..6b0697e89
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+/**
+ * Checks that the tensor infos describe a configuration CLSpaceToBatchNDKernel can handle.
+ *
+ * @param[in] input        Source tensor info. Single channel; U8/QASYMM8/S16/F16/S32/F32.
+ * @param[in] block_size   Block size tensor info. Single channel S32.
+ * @param[in] padding_size Padding size tensor info. Single channel S32.
+ * @param[in] output       Destination tensor info. Single channel; same set of types as input.
+ *
+ * @return An empty Status when every check passes, otherwise the Status of the first failed check.
+ */
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_size,
+                          const ITensorInfo *padding_size, const ITensorInfo *output)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+                                                       DataType::S16, DataType::F16, DataType::S32,
+                                                       DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_size, 1, DataType::S32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(padding_size, 1, DataType::S32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+                                                       DataType::S16, DataType::F16, DataType::S32,
+                                                       DataType::F32);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != output->num_dimensions(),
+                                  "The number of dimensions of input should be equal to output");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() != output->data_layout(),
+                                  "The input and output layouts are different!");
+
+  // A rank cannot be below 2 and above 4 at the same time, so the range has to be rejected with
+  // `||`. The check runs before the layout dispatch so that it is actually reachable.
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() < 2 || input->num_dimensions() > 4,
+                                  "CLSpaceToBatchNDKernel supports dimensions up to 4");
+
+  // TODO Support other cases
+  if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NCHW)
+  {
+    // Dimension 2 is compared as the depth for this layout.
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != output->dimension(2),
+                                    "Input Depth should be equal to Output Depth");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 ||
+                                      padding_size->dimension(1) != 2,
+                                    "Only 2-dimensional spatial block's size was wrong");
+  }
+  else if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NHWC)
+  {
+    // Dimension 0 is compared as the depth for this layout.
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(0) != output->dimension(0),
+                                    "Input Depth should be equal to Output Depth");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 ||
+                                      padding_size->dimension(1) != 2,
+                                    "Only 2-dimensional spatial block's size was wrong");
+  }
+  else
+  {
+    ARM_COMPUTE_RETURN_ERROR_MSG("CLSpaceToBatchNDKernel supports only 4-dimensional input");
+  }
+
+  if (input->data_type() == DataType::QASYMM8)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->quantization_info() != output->quantization_info(),
+                                    "The input and output quantization info are different!");
+  }
+
+  return Status{};
+}
+
+} // namespace
+
+/** Default constructor: all tensor pointers stay null until configure() assigns them. */
+CLSpaceToBatchNDKernel::CLSpaceToBatchNDKernel() : _input(nullptr), _output(nullptr)
+{
+  // These two members used to be left uninitialized even though run() dereferences them.
+  // They are assigned in the body so that the result does not depend on the member
+  // declaration order in the header.
+  _block_size = nullptr;
+  _padding_size = nullptr;
+}
+
+/**
+ * Stores the tensors, builds the layout-specific OpenCL kernel and sets the execution window.
+ *
+ * @param[in]  input        Source tensor.
+ * @param[in]  block_size   Block size tensor.
+ * @param[in]  padding_size Padding size tensor.
+ * @param[out] output       Destination tensor. Its valid region is set in the NCHW path.
+ *
+ * The kernel name is "space_to_batch_4d" with a "_nchw" or "_nhwc" suffix chosen from the
+ * input's data layout; any other layout is reported through ARM_COMPUTE_ERROR.
+ */
+void CLSpaceToBatchNDKernel::configure(const ICLTensor *input, const ICLTensor *block_size,
+                                       const ICLTensor *padding_size, ICLTensor *output)
+{
+  // block_size and padding_size are dereferenced in the very next statement, so they need the
+  // same null check as input and output.
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_size, padding_size, output);
+  ARM_COMPUTE_ERROR_THROW_ON(
+    validate_arguments(input->info(), block_size->info(), padding_size->info(), output->info()));
+
+  _input = input;
+  _block_size = block_size;
+  _padding_size = padding_size;
+  _output = output;
+
+  // Set kernel build options
+  // TODO Support other cases
+  std::string kernel_name = "space_to_batch_4d";
+  std::set<std::string> build_opts;
+  Window win;
+
+  if (input->info()->data_layout() == DataLayout::NCHW)
+  {
+    kernel_name += "_nchw";
+    build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+    build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(1)));
+    build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(0)));
+
+    // One work item per output element (default steps).
+    win = calculate_max_window(*output->info(), Steps());
+
+    // The whole output shape is declared valid.
+    Coordinates coord;
+    coord.set_num_dimensions(output->info()->num_dimensions());
+    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+  }
+  else if (input->info()->data_layout() == DataLayout::NHWC)
+  {
+    kernel_name += "_nhwc";
+    build_opts.emplace("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+    build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
+    build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(1)));
+    build_opts.emplace("-DVEC_SIZE=" +
+                       support::cpp11::to_string(num_elems_processed_per_iteration));
+
+    // The window advances num_elems_processed_per_iteration elements at a time along X.
+    win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win, input_access, output_access);
+    // NOTE(review): this sets the valid region through the *input* access window using the
+    // output's valid region; confirm that output_access was not the intended target.
+    input_access.set_valid_region(win, output->info()->valid_region());
+
+    if (window_changed)
+    {
+      // ARM_COMPUTE_CREATE_ERROR only builds a Status object; without throwing it the failure
+      // was silently dropped.
+      ARM_COMPUTE_ERROR_THROW_ON(
+        ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!"));
+    }
+  }
+  else
+  {
+    ARM_COMPUTE_ERROR("Unsupported layout");
+  }
+
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.emplace("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(3)));
+  // ZERO_VALUE is the quantization offset for QASYMM8 inputs and 0 for every other type.
+  if (input->info()->data_type() == DataType::QASYMM8)
+  {
+    build_opts.emplace("-DZERO_VALUE=" +
+                       support::cpp11::to_string(input->info()->quantization_info().offset));
+  }
+  else
+  {
+    build_opts.emplace("-DZERO_VALUE=" + support::cpp11::to_string(0));
+  }
+
+  // Create kernel
+  _kernel =
+    static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+  // Configure kernel window
+  ICLKernel::configure_internal(win);
+}
+
+/**
+ * Enqueues the configured kernel for every 4D slice of the execution window.
+ *
+ * @param[in]     window Execution window; must match the window set in configure().
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ *
+ * When DEBUG is defined, the block size and padding tensors are mapped and their values are
+ * checked against the input and output shapes before anything is enqueued.
+ */
+void CLSpaceToBatchNDKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+#if defined(DEBUG)
+  const_cast<ICLTensor *>(_block_size)->map(queue);
+  const_cast<ICLTensor *>(_padding_size)->map(queue);
+
+  const size_t num_dimensions = _input->info()->num_dimensions();
+  const size_t num_spacial_dimensions = _block_size->info()->dimension(0);
+  int32_t batch_size = _input->info()->dimension(num_dimensions - 1);
+  for (size_t i = 0; i < num_spacial_dimensions; ++i)
+  {
+    const int32_t block_size = *reinterpret_cast<int32_t *>(_block_size->ptr_to_element({i}));
+    const int32_t padding_size_pre =
+      *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({0, i}));
+    const int32_t padding_size_post =
+      *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({1, i}));
+
+    ARM_COMPUTE_ERROR_ON_MSG(block_size < 1, "Block size should be greater than or equal to 1");
+    // Either padding value being negative is invalid; `&&` only caught the case where both were.
+    ARM_COMPUTE_ERROR_ON_MSG(padding_size_pre < 0 || padding_size_post < 0,
+                             "Padding size should be greater than or equal to 0");
+
+    if (num_dimensions == 4 && _input->info()->data_layout() == DataLayout::NCHW)
+    {
+      ARM_COMPUTE_ERROR_ON_MSG(
+        _output->info()->dimension(i) !=
+          (_input->info()->dimension(i) + padding_size_pre + padding_size_post) / block_size,
+        "Dimension value of spatial block does not match output's dimension value");
+    }
+    else
+    {
+      ARM_COMPUTE_ERROR_ON_MSG(
+        _output->info()->dimension(num_dimensions - num_spacial_dimensions - 1 + i) !=
+          (_input->info()->dimension(num_dimensions - num_spacial_dimensions - 1 + i) +
+           padding_size_pre + padding_size_post) /
+            block_size,
+        "Dimension value of spatial block does not match output's dimension value");
+    }
+
+    // The output batch must equal the input batch times the product of all block sizes.
+    batch_size *= block_size;
+  }
+  ARM_COMPUTE_ERROR_ON_MSG(
+    _output->info()->dimension(num_dimensions - 1) != batch_size,
+    "Output batch size should be equal to input batch size * (multiplication of all block size)");
+
+  const_cast<ICLTensor *>(_block_size)->unmap(queue);
+  const_cast<ICLTensor *>(_padding_size)->unmap(queue);
+#endif // defined(DEBUG)
+
+  // The output slice drives the enqueue.
+  Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  // Setup input slice: all four dimensions are reset to (0, 0, 0).
+  Window slice_in(slice_out);
+  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+  slice_in.set(3, Window::Dimension(0, 0, 0));
+
+  // Set block size window
+  Window win_block = calculate_max_window(*_block_size->info(), Steps());
+
+  // Set padding size window
+  Window win_padding = calculate_max_window(*_padding_size->info(), Steps());
+
+  do
+  {
+    // Argument order: input (4D), output (4D), block size (1D), padding size (2D).
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, slice_in);
+    add_4D_tensor_argument(idx, _output, slice_out);
+    add_1D_tensor_argument(idx, _block_size, win_block);
+    add_2D_tensor_argument(idx, _padding_size, win_padding);
+    enqueue(queue, *this, slice_out);
+  } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp
new file mode 100644
index 000000000..5d6329edc
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+/**
+ * Checks that input, output and block size describe a valid space-to-depth configuration.
+ *
+ * @param[in] input      Source tensor info. Single channel; U8/QASYMM8/S16/S32/F16/F32.
+ * @param[in] output     Destination tensor info. Single channel; same set of types as input.
+ * @param[in] block_size Block size; must be at least 1.
+ *
+ * @return An empty Status when every check passes, otherwise the Status of the first failed check.
+ *
+ * Every ARM_COMPUTE_RETURN_ERROR_ON_MSG condition below states the *failure* case. The earlier
+ * version passed the success condition instead, which rejected valid configurations and let
+ * invalid ones through.
+ */
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          const int32_t block_size)
+{
+  // Status-returning variants, so a bad data type is reported through the returned Status.
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+                                                       DataType::S16, DataType::S32, DataType::F16,
+                                                       DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+                                                       DataType::S16, DataType::S32, DataType::F16,
+                                                       DataType::F32);
+
+  // Must come first: the modulo and division below require a non-zero block size.
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1,
+                                  "Block size should be greater than or equal to 1.");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(3) != output->dimension(3),
+                                  "Input batch should be equal to Output batch");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+    input->dimension(2) * block_size * block_size != output->dimension(2),
+    "Output depth should be equal to (input depth * block size *block size)");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->dimension(0) % block_size) != 0 ||
+                                    (input->dimension(1) % block_size) != 0,
+                                  "Input height and width should be divisible by block size");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != (input->dimension(0) / block_size)) ||
+                                    (output->dimension(1) != (input->dimension(1) / block_size)),
+                                  "Output height and width should be equal to "
+                                  "input_height/blocksize and input_width/blocksize respectively");
+
+  return Status{};
+}
+
+} // namespace
+
+/** Creates an unconfigured kernel; both tensor pointers are null until configure() sets them. */
+CLSpaceToDepthKernel::CLSpaceToDepthKernel()
+  : _input(nullptr),
+    _output(nullptr)
+{
+}
+
+/**
+ * Stores the tensors, builds the "space_to_depth" OpenCL kernel and sets the execution window.
+ *
+ * @param[in]  input      Source tensor.
+ * @param[out] output     Destination tensor; its valid region is set to its whole shape.
+ * @param[in]  block_size Block size, forwarded to the kernel as -DBLOCK_SIZE.
+ */
+void CLSpaceToDepthKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                     const int32_t block_size)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size));
+
+  _input = input;
+  _output = output;
+
+  // Compile-time definitions handed to the OpenCL program.
+  const std::string data_type_opt =
+    "-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type());
+  const std::string block_size_opt = "-DBLOCK_SIZE=" + support::cpp11::to_string(block_size);
+  const std::string depth_in_opt =
+    "-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2));
+  const std::set<std::string> kernel_defines{data_type_opt, block_size_opt, depth_in_opt};
+
+  // Build the kernel object.
+  _kernel = static_cast<cl::Kernel>(
+    CLKernelLibraryEx::get().create_kernel("space_to_depth", kernel_defines));
+
+  // The execution window is derived from the input tensor with default steps.
+  Window input_window = calculate_max_window(*input->info(), Steps());
+
+  // Declare the complete output shape as valid.
+  Coordinates origin;
+  origin.set_num_dimensions(output->info()->num_dimensions());
+  const ValidRegion whole_output(origin, output->info()->tensor_shape());
+  output->info()->set_valid_region(whole_output);
+
+  ICLKernel::configure_internal(input_window);
+}
+
+/**
+ * Enqueues the configured kernel for every 4D slice of the execution window.
+ *
+ * @param[in]     window Execution window; must match the window set in configure().
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLSpaceToDepthKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+  // The input slice drives the enqueue.
+  Window src_slice = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  // The output slice starts as a copy and has its first four dimensions reset to (0, 0, 0).
+  Window dst_slice(src_slice);
+  for (size_t dim = 0; dim < 4; ++dim)
+  {
+    dst_slice.set(dim, Window::Dimension(0, 0, 0));
+  }
+
+  bool has_next = true;
+  while (has_next)
+  {
+    unsigned int arg_idx = 0;
+    add_4D_tensor_argument(arg_idx, _input, src_slice);
+    add_4D_tensor_argument(arg_idx, _output, dst_slice);
+    enqueue(queue, *this, src_slice);
+
+    // Advance the input slice first; the output slice only advances when that succeeded.
+    has_next = window.slide_window_slice_4D(src_slice) && window.slide_window_slice_4D(dst_slice);
+  }
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp
new file mode 100644
index 000000000..260bc39f1
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLSquaredDifferenceKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+/**
+ * Checks the two inputs and, when already configured, the output of the squared-difference kernel.
+ *
+ * @param[in] input1 First input tensor info. Single channel F16 or F32.
+ * @param[in] input2 Second input tensor info. Single channel F16 or F32.
+ * @param[in] output Output tensor info; only inspected when its total size is non-zero.
+ *
+ * @return An empty Status when every check passes, otherwise the Status of the first failed check.
+ */
+Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+  const TensorShape broadcast =
+    TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::F16, DataType::F32);
+
+  // An empty broadcast shape is reported as incompatible inputs.
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(broadcast.total_size() == 0,
+                                  "Inputs are not broadcast compatible");
+
+  // Nothing more to verify while the output has not been configured.
+  if (output->total_size() == 0)
+  {
+    return Status{};
+  }
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+    detail::have_different_dimensions(broadcast, output->tensor_shape(), 0),
+    "Wrong shape for output");
+
+  return Status{};
+}
+} // namespace
+
+/** Creates an unconfigured kernel; all tensor pointers are null until configure() sets them. */
+CLSquaredDifferenceKernel::CLSquaredDifferenceKernel()
+  : _input1(nullptr),
+    _input2(nullptr),
+    _output(nullptr)
+{
+}
+
+/**
+ * Stores the tensors, builds the "squared_difference" OpenCL kernel, auto-initializes the
+ * output's shape/format when unset, and configures the execution window.
+ *
+ * @param[in]  input1 First input tensor.
+ * @param[in]  input2 Second input tensor.
+ * @param[out] output Destination tensor.
+ */
+void CLSquaredDifferenceKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
+ ICLTensor *output)
+{
+ // NOTE(review): the output data type is compared against input1 before the auto-initialization
+ // further down; confirm this is intended for an output whose format is still unknown.
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input1->info(), input2->info(), output->info()));
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ // Create kernel
+ // DATA_TYPE comes from input1; VEC_SIZE is the per-iteration element count (16).
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())));
+ build_opts.emplace(
+ ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("squared_difference", build_opts));
+
+ // Broadcast output shape and valid region computed from both inputs.
+ const std::pair<TensorShape, ValidRegion> broadcast_pair =
+ ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
+
+ const TensorShape &out_shape = broadcast_pair.first;
+ const ValidRegion &valid_region = broadcast_pair.second;
+
+ // Auto initialize output if not initialized
+ {
+ set_shape_if_empty(*output->info(), out_shape);
+
+ // F16 is chosen only when both inputs are F16; F32 when either input is F32.
+ if (input1->info()->data_type() == DataType::F16 &&
+ input2->info()->data_type() == DataType::F16)
+ {
+ set_format_if_unknown(*output->info(), Format::F16);
+ }
+ else if (input1->info()->data_type() == DataType::F32 ||
+ input2->info()->data_type() == DataType::F32)
+ {
+ set_format_if_unknown(*output->info(), Format::F32);
+ }
+ }
+
+ // The output window is derived from the broadcast valid region; each input gets its own
+ // window produced by broadcast_if_dimension_le_one().
+ Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+ Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info());
+ Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info());
+
+ AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ // NOTE(review): `||` short-circuits, so once one call returns true the remaining
+ // update_window_and_padding() calls are not executed; window_changed is also never read
+ // afterwards. Confirm both are intended.
+ bool window_changed = update_window_and_padding(win_input1, input1_access) ||
+ update_window_and_padding(win_input2, input2_access) ||
+ update_window_and_padding(win, output_access);
+
+ output_access.set_valid_region(win, valid_region);
+
+ ICLKernel::configure_internal(win);
+}
+
+/**
+ * Enqueues the configured kernel for every 3D slice of the (possibly collapsed) window.
+ *
+ * @param[in]     window Execution window; must be a valid sub-window of the configured window.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLSquaredDifferenceKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  const TensorShape &shape_a = _input1->info()->tensor_shape();
+  const TensorShape &shape_b = _input2->info()->tensor_shape();
+  const TensorShape &shape_dst = _output->info()->tensor_shape();
+
+  // Collapsing is always attempted when the smaller input has at most one element. Otherwise
+  // both inputs need more than Window::DimZ dimensions and must agree on every dimension from
+  // Window::DimZ up to the output rank.
+  bool mergeable = true;
+  if (std::min(shape_a.total_size(), shape_b.total_size()) > 1)
+  {
+    mergeable = std::min(shape_a.num_dimensions(), shape_b.num_dimensions()) > Window::DimZ;
+    if (mergeable)
+    {
+      for (size_t dim = Window::DimZ; dim < shape_dst.num_dimensions(); ++dim)
+      {
+        if (shape_a[dim] != shape_b[dim])
+        {
+          mergeable = false;
+          break;
+        }
+      }
+    }
+  }
+
+  bool did_merge = false;
+  Window merged_window = window;
+  if (mergeable)
+  {
+    merged_window = window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &did_merge);
+  }
+
+  // The input shapes follow the window: collapsed from Window::DimZ only if the window was.
+  TensorShape merged_shape_a = shape_a;
+  TensorShape merged_shape_b = shape_b;
+  if (did_merge)
+  {
+    merged_shape_a = shape_a.collapsed_from(Window::DimZ);
+    merged_shape_b = shape_b.collapsed_from(Window::DimZ);
+  }
+
+  Window dst_slice = merged_window.first_slice_window_3D();
+  Window slice_a = dst_slice.broadcast_if_dimension_le_one(merged_shape_a);
+  Window slice_b = dst_slice.broadcast_if_dimension_le_one(merged_shape_b);
+
+  bool has_next = true;
+  while (has_next)
+  {
+    unsigned int arg_idx = 0;
+    add_3D_tensor_argument(arg_idx, _input1, slice_a);
+    add_3D_tensor_argument(arg_idx, _input2, slice_b);
+    add_3D_tensor_argument(arg_idx, _output, dst_slice);
+
+    enqueue(queue, *this, dst_slice);
+
+    // Both input slices are advanced unconditionally; only the output slice ends the loop.
+    merged_window.slide_window_slice_3D(slice_a);
+    merged_window.slide_window_slice_3D(slice_b);
+    has_next = merged_window.slide_window_slice_3D(dst_slice);
+  }
+}
+
+/**
+ * Computes the border this kernel requires.
+ *
+ * The border is the difference between the output's dimension 0 and the smaller dimension 0
+ * of the two inputs, capped at num_elems_processed_per_iteration - 1.
+ *
+ * @return A BorderSize whose second constructor argument is that border; the others are 0.
+ */
+BorderSize CLSquaredDifferenceKernel::border_size() const
+{
+  const auto narrowest_input =
+    std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+  const unsigned int width_gap = _output->info()->dimension(0) - narrowest_input;
+  const unsigned int max_border = num_elems_processed_per_iteration - 1U;
+
+  return BorderSize(0, std::min<unsigned int>(max_border, width_gap), 0, 0);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp
index 80ffd423a..48146a43a 100644
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceKernel.cpp
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp
@@ -14,43 +14,30 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h"
+#include "arm_compute/core/CL/kernels/CLStridedSliceExKernel.h"
-#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include <string>
-
-using namespace std;
using namespace arm_compute;
-static const int32_t maxDim = 4;
-
-CLStridedSliceKernel::CLStridedSliceKernel()
+CLStridedSliceExKernel::CLStridedSliceExKernel()
: _input(nullptr), _output(nullptr), _beginData(nullptr), _endData(nullptr),
_stridesData(nullptr), _beginMask(0), _endMask(0), _shrinkAxisMask(0)
{
}
-Status CLStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *begin, const ITensorInfo *end,
- const ITensorInfo *strides, int32_t beginMask,
- int32_t endMask, int32_t shrinkAxisMask)
+Status CLStridedSliceExKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *begin, const ITensorInfo *end,
+ const ITensorInfo *strides, int32_t beginMask,
+ int32_t endMask, int32_t shrinkAxisMask)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, begin, end, strides);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
- input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8, DataType::U16,
- DataType::S16, DataType::QS16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(begin, 1, DataType::S32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(end, 1, DataType::S32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(strides, 1, DataType::S32);
@@ -153,15 +140,6 @@ inline int32_t StopForAxis(int32_t endMask, int32_t end, int32_t stride,
return stop;
}
-inline int32_t offset4D(const TensorShape &shape, int32_t b, int32_t d, int32_t h, int32_t w)
-{
- int32_t offset = b * shape[2] * shape[1] * shape[0];
- offset += d * shape[1] * shape[0];
- offset += h * shape[0];
- offset += w;
- return offset;
-}
-
inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride)
{
int32_t ret = 0;
@@ -177,10 +155,10 @@ inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride)
return ret;
}
-void CLStridedSliceKernel::configure(const ICLTensor *input, ICLTensor *output,
- ICLTensor *beginData, ICLTensor *endData,
- ICLTensor *stridesData, int32_t beginMask, int32_t endMask,
- int32_t shrinkAxisMask)
+void CLStridedSliceExKernel::configure(const ICLTensor *input, ICLTensor *output,
+ ICLTensor *beginData, ICLTensor *endData,
+ ICLTensor *stridesData, int32_t beginMask, int32_t endMask,
+ int32_t shrinkAxisMask)
{
ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), beginData->info(),
endData->info(), stridesData->info(), beginMask, endMask,
@@ -195,48 +173,31 @@ void CLStridedSliceKernel::configure(const ICLTensor *input, ICLTensor *output,
_endMask = endMask;
_shrinkAxisMask = shrinkAxisMask;
- constexpr unsigned int num_elems_processed_per_iteration = 1;
-
// Set kernel build options
std::set<std::string> build_opts;
build_opts.emplace("-DELEMENT_DATA_TYPE=" +
get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.emplace("-DELEMENT_SIZE=" + support::cpp11::to_string(input->info()->element_size()));
+ build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
// Create kernel
- _kernel =
- static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("strided_slice", build_opts));
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("strided_slice_ex", build_opts));
- // Create output's window without padding
- TensorShape collapsed = output->info()->tensor_shape();
- collapsed.collapse(4);
- TensorInfo info = *output->info();
- info.set_tensor_shape(collapsed);
- Window win = calculate_max_window(info, Steps(num_elems_processed_per_iteration));
-
- ICLKernel::configure(win);
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps());
+ ICLKernel::configure_internal(win);
}
-void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue)
+void CLStridedSliceExKernel::run(const Window &window, cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- // Create input window
- TensorShape collapsed = _input->info()->tensor_shape();
- collapsed.collapse(4);
- TensorInfo info = *_input->info();
- info.set_tensor_shape(collapsed);
- Window win_in = calculate_max_window(info, Steps(_input->info()->tensor_shape().total_size()));
-
_beginData->map(queue);
_endData->map(queue);
_stridesData->map(queue);
- std::vector<int32_t> dimsIn;
- std::vector<int32_t> dimsOut;
std::vector<int32_t> starts;
- std::vector<int32_t> stops;
std::vector<int32_t> strides;
for (uint32_t n = 0; n < _beginData->info()->tensor_shape().total_size(); ++n)
@@ -246,22 +207,13 @@ void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue)
StartForAxis(_beginMask, reinterpret_cast<int32_t *>(_beginData->buffer())[n],
reinterpret_cast<int32_t *>(_stridesData->buffer())[n], shape, n));
- stops.emplace_back(StopForAxis(_endMask, reinterpret_cast<int32_t *>(_endData->buffer())[n],
- reinterpret_cast<int32_t *>(_stridesData->buffer())[n], shape,
- n));
-
strides.emplace_back(reinterpret_cast<int32_t *>(_stridesData->buffer())[n]);
- dimsIn.emplace_back(shape[n]);
- dimsOut.emplace_back(getOutDim(starts[n], stops[n], strides[n]));
}
for (uint32_t n = _beginData->info()->tensor_shape().total_size(); n < 4; n++)
{
starts.emplace_back(0);
- stops.emplace_back(1);
strides.emplace_back(1);
- dimsIn.emplace_back(1);
- dimsOut.emplace_back(1);
}
// TODO: Apply shrinkAxisMask
@@ -269,20 +221,7 @@ void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue)
_stridesData->unmap(queue);
_endData->unmap(queue);
- // Set parameters
- unsigned int idx = 2 * num_arguments_per_1D_tensor(); // Skip the input and output parameters
- const cl_int4 dimsInArg = {{
- static_cast<cl_int>(dimsIn[0]), static_cast<cl_int>(dimsIn[1]),
- static_cast<cl_int>(dimsIn[2]), static_cast<cl_int>(dimsIn[3]),
- }};
- _kernel.setArg<cl_int4>(idx++, dimsInArg);
-
- const cl_int4 dimsOutArg = {{
- static_cast<cl_int>(dimsOut[0]), static_cast<cl_int>(dimsOut[1]),
- static_cast<cl_int>(dimsOut[2]), static_cast<cl_int>(dimsOut[3]),
- }};
- _kernel.setArg<cl_int4>(idx++, dimsOutArg);
-
+ unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
const cl_int4 startsArg = {{
static_cast<cl_int>(starts[0]), static_cast<cl_int>(starts[1]),
static_cast<cl_int>(starts[2]), static_cast<cl_int>(starts[3]),
@@ -295,10 +234,20 @@ void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue)
}};
_kernel.setArg<cl_int4>(idx++, stridesArg);
- // TODO: Apply slicing output's window
- idx = 0;
- add_1D_tensor_argument(idx, _input, win_in);
- add_1D_tensor_argument(idx, _output, window);
+ Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+ // Setup output slice
+ Window slice_in(slice_out);
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ slice_in.set(3, Window::Dimension(0, 0, 0));
- enqueue(queue, *this, window);
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out);
+ } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp
index d95b485b7..073c2f7bb 100644
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp
@@ -17,15 +17,8 @@
#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <climits>
-#include <cassert>
namespace arm_compute
{
@@ -59,7 +52,7 @@ void CLTopKV2Single::configure(ICLTensor *input, ICLTensor *topk_values, ICLTens
// Configure kernel window
Window win;
win.set(0, Window::Dimension(0, 1, 1));
- ICLKernel::configure(win);
+ ICLKernel::configure_internal(win);
}
void CLTopKV2Single::run(const Window &window, cl::CommandQueue &queue)
@@ -102,7 +95,7 @@ void CLTopKV2Init::configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffe
// Configure kernel window
Window win;
win.set(0, Window::Dimension(0, n, 1));
- ICLKernel::configure(win);
+ ICLKernel::configure_internal(win);
}
void CLTopKV2Init::run(const Window &window, cl::CommandQueue &queue)
@@ -147,7 +140,7 @@ void CLRadixSortHistogram::configure(cl::Buffer *hist_buf, int bits, int n)
// Configure kernel window
Window win;
win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1));
- ICLKernel::configure(win);
+ ICLKernel::configure_internal(win);
}
void CLRadixSortHistogram::run(const Window &window, cl::CommandQueue &queue)
@@ -192,7 +185,7 @@ void CLRadixSortScanHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_
// Configure kernel window
Window win;
win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1));
- ICLKernel::configure(win);
+ ICLKernel::configure_internal(win);
}
void CLRadixSortScanHistogram::run(const Window &window, cl::CommandQueue &queue)
@@ -236,7 +229,7 @@ void CLRadixSortGlobalScanHistogram::configure(cl::Buffer *glob_sum_buf, cl::Buf
// Configure kernel window
Window win;
win.set(0, Window::Dimension(0, _HISTOSPLIT / 2, 1));
- ICLKernel::configure(win);
+ ICLKernel::configure_internal(win);
}
void CLRadixSortGlobalScanHistogram::run(const Window &window, cl::CommandQueue &queue)
@@ -275,7 +268,7 @@ void CLRadixSortPasteHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob
// Configure kernel window
Window win;
win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1));
- ICLKernel::configure(win);
+ ICLKernel::configure_internal(win);
}
void CLRadixSortPasteHistogram::run(const Window &window, cl::CommandQueue &queue)
@@ -322,7 +315,7 @@ void CLRadixSortReorder::configure(cl::Buffer *hist_buf, int bits, int n)
// Configure kernel window
Window win;
win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1));
- ICLKernel::configure(win);
+ ICLKernel::configure_internal(win);
}
void CLRadixSortReorder::run(const Window &window, cl::CommandQueue &queue)
@@ -365,7 +358,7 @@ void CLTopKV2FindFirstNegative::configure(cl::Buffer *first_negative_idx_buf, in
// Configure kernel window
Window win;
win.set(0, Window::Dimension(0, n, 1));
- ICLKernel::configure(win);
+ ICLKernel::configure_internal(win);
}
void CLTopKV2FindFirstNegative::run(const Window &window, cl::CommandQueue &queue)
@@ -404,7 +397,7 @@ void CLTopKV2ReorderNegatives::configure(cl::Buffer *first_negative_idx_buf, int
// Configure kernel window
Window win;
win.set(0, Window::Dimension(0, n, 1));
- ICLKernel::configure(win);
+ ICLKernel::configure_internal(win);
}
void CLTopKV2ReorderNegatives::run(const Window &window, cl::CommandQueue &queue)
@@ -449,7 +442,7 @@ void CLTopKV2Store::configure(ICLTensor *values, ICLTensor *indices, int k, int
// Configure kernel window
Window win;
win.set(0, Window::Dimension(0, k, 1));
- ICLKernel::configure(win);
+ ICLKernel::configure_internal(win);
}
void CLTopKV2Store::setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf)