Diffstat (limited to 'libs/ARMComputeEx')
-rw-r--r--  libs/ARMComputeEx/CMakeLists.txt | 21
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h | 189
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h | 57
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherKernel.h | 71
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h | 87
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h | 73
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReductionMeanKernel.h | 78
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceKernel.h | 106
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h | 301
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h | 45
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h | 49
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h | 72
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h | 81
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReductionMean.h | 73
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSlice.h | 69
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h | 109
-rw-r--r--  libs/ARMComputeEx/resolve_includes.py | 102
-rw-r--r--  libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp | 547
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl | 138
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl | 148
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/fixed_point.h | 565
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl | 106
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h | 344
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h | 406
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl | 96
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl | 103
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl | 119
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl | 60
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/reduction_mean.cl | 69
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice.cl | 104
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl | 111
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl | 138
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl | 279
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp | 109
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp | 142
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp | 322
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp | 129
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLReductionMeanKernel.cpp | 198
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceKernel.cpp | 304
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp | 475
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp | 29
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp | 38
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp | 52
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp | 121
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLReductionMean.cpp | 51
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSlice.cpp | 307
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp | 305
-rw-r--r--  libs/ARMComputeEx/src/runtime/topk_v2.h | 143
48 files changed, 7641 insertions, 0 deletions
diff --git a/libs/ARMComputeEx/CMakeLists.txt b/libs/ARMComputeEx/CMakeLists.txt
new file mode 100644
index 000000000..2483fb55d
--- /dev/null
+++ b/libs/ARMComputeEx/CMakeLists.txt
@@ -0,0 +1,21 @@
+if("${TARGET_ARCH}" STREQUAL "x86_64")
+ return()
+endif()
+
+nnfw_find_package(ARMCompute REQUIRED)
+
+set(ACL_EX_BASE ${CMAKE_SOURCE_DIR}/libs/ARMComputeEx)
+
+file(GLOB_RECURSE ACL_EX_SRCS "${ACL_EX_BASE}/*.cpp")
+
+# Generate embedded cl_kernel sources
+execute_process (
+ WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}/libs/ARMComputeEx"
+ COMMAND bash -c "python resolve_includes.py"
+)
+
+add_library(arm_compute_ex SHARED ${ACL_EX_SRCS})
+set_target_properties(arm_compute_ex PROPERTIES COMPILE_FLAGS "-DEMBEDDED_KERNELS=1")
+target_include_directories(arm_compute_ex PUBLIC ${CMAKE_SOURCE_DIR}/libs/ARMComputeEx)
+target_link_libraries(arm_compute_ex arm_compute_core)
+install(TARGETS arm_compute_ex DESTINATION lib)
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h b/libs/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h
new file mode 100644
index 000000000..026487077
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__
+#define __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__
+
+#include "arm_compute/core/CL/OpenCL.h"
+
+#include <map>
+#include <set>
+#include <string>
+#include <utility>
+
+namespace arm_compute
+{
+
+/** CLKernelLibraryEx class */
+class CLKernelLibraryEx
+{
+ using StringSet = std::set<std::string>;
+
+private:
+ /** Default Constructor. */
+ CLKernelLibraryEx();
+
+public:
+ /** Prevent instances of this class from being copied */
+ CLKernelLibraryEx(const CLKernelLibraryEx &) = delete;
+ /** Prevent instances of this class from being copied */
+ const CLKernelLibraryEx &operator=(const CLKernelLibraryEx &) = delete;
+ /** Access the KernelLibrary singleton.
+ * @return The KernelLibrary instance.
+ */
+ static CLKernelLibraryEx &get();
+ /** Initialises the kernel library.
+ *
+ * @param[in] kernel_path (Optional) Path of the directory from which kernel sources are loaded.
+ * @param[in] context (Optional) CL context used to create programs.
+ * @param[in] device (Optional) CL device for which the programs are created.
+ */
+ void init(std::string kernel_path = ".", cl::Context context = cl::Context::getDefault(),
+ cl::Device device = cl::Device::getDefault())
+ {
+ _kernel_path = std::move(kernel_path);
+ _context = std::move(context);
+ _device = std::move(device);
+ }
+ /** Sets the path that the kernels reside in.
+ *
+ * @param[in] kernel_path Path of the kernel.
+ */
+ void set_kernel_path(const std::string &kernel_path) { _kernel_path = kernel_path; };
+ /** Gets the path that the kernels reside in.
+ */
+ std::string get_kernel_path() { return _kernel_path; };
+ /** Gets the source of the selected program.
+ *
+ * @param[in] program_name Program name.
+ *
+ * @return Source of the selected program.
+ */
+ std::string get_program_source(const std::string &program_name);
+ /** Sets the CL context used to create programs.
+ *
+ * @note Setting the context also resets the device to the
+ * first one available in the new context.
+ *
+ * @param[in] context A CL context.
+ */
+ void set_context(cl::Context context)
+ {
+ _context = std::move(context);
+ if (_context.get() == nullptr)
+ {
+ _device = cl::Device();
+ }
+ else
+ {
+ const auto cl_devices = _context.getInfo<CL_CONTEXT_DEVICES>();
+
+ if (cl_devices.empty())
+ {
+ _device = cl::Device();
+ }
+ else
+ {
+ _device = cl_devices[0];
+ }
+ }
+ }
+
+ /** Accessor for the associated CL context.
+ *
+ * @return A CL context.
+ */
+ cl::Context &context() { return _context; }
+
+ /** Sets the CL device for which the programs are created.
+ *
+ * @param[in] device A CL device.
+ */
+ void set_device(cl::Device device) { _device = std::move(device); }
+
+ /** Return the device version
+ *
+ * @return The content of CL_DEVICE_VERSION
+ */
+ std::string get_device_version();
+ /** Creates a kernel from the kernel library.
+ *
+ * @param[in] kernel_name Kernel name.
+ * @param[in] build_options_set Kernel build options as a set.
+ *
+ * @return The created kernel.
+ */
+ Kernel create_kernel(const std::string &kernel_name,
+ const StringSet &build_options_set = {}) const;
+ /** Find the maximum number of local work items in a workgroup that can be supported for the kernel.
+ *
+ */
+ size_t max_local_workgroup_size(const cl::Kernel &kernel) const;
+ /** Return the default NDRange for the device.
+ *
+ */
+ cl::NDRange default_ndrange() const;
+
+ /** Clear the library's cache of binary programs
+ */
+ void clear_programs_cache()
+ {
+ _programs_map.clear();
+ _built_programs_map.clear();
+ }
+
+ /** Access the cache of built OpenCL programs */
+ const std::map<std::string, cl::Program> &get_built_programs() const
+ {
+ return _built_programs_map;
+ }
+
+ /** Add a new built program to the cache
+ *
+ * @param[in] built_program_name Name of the program
+ * @param[in] program Built program to add to the cache
+ */
+ void add_built_program(const std::string &built_program_name, cl::Program program);
+
+private:
+ /** Load program and its dependencies.
+ *
+ * @param[in] program_name Name of the program to load.
+ */
+ const Program &load_program(const std::string &program_name) const;
+ /** Concatenates contents of a set into a single string.
+ *
+ * @param[in] s Input set to concatenate.
+ *
+ * @return Concatenated string.
+ */
+ std::string stringify_set(const StringSet &s) const;
+
+ cl::Context _context; /**< Underlying CL context. */
+ cl::Device _device; /**< Underlying CL device. */
+ std::string _kernel_path; /**< Path to the kernels folder. */
+ mutable std::map<std::string, const Program>
+ _programs_map; /**< Map with all already loaded program data. */
+ mutable std::map<std::string, cl::Program>
+ _built_programs_map; /**< Map with all already built program data. */
+ static const std::map<std::string, std::string>
+ _kernel_program_map; /**< Map that associates kernel names with programs. */
+ static const std::map<std::string, std::string>
+ _program_source_map; /**< Contains sources for all programs.
+ Used for compile-time kernel inclusion. */
+};
+}
+#endif /* __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ */
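A minimal usage sketch (illustrative, not part of this patch): initialise the singleton with an existing CL context/device, then create kernels by name. The kernel name and build option below are assumptions; the real names live in the _kernel_program_map defined in src/core/CL/CLKernelLibrary.cpp.

#include "arm_compute/core/CL/CLKernelLibrary.h" // provides arm_compute::Kernel / Program
#include "arm_compute/core/CL/CLKernelLibraryEx.h"

#include <set>
#include <string>

void init_kernel_library_ex(cl::Context context, cl::Device device)
{
    auto &lib = arm_compute::CLKernelLibraryEx::get();

    // The kernel path is only consulted when sources are not embedded; this
    // library is built with -DEMBEDDED_KERNELS=1 (see CMakeLists.txt above).
    lib.init("./cl_kernels/", context, device);

    // Hypothetical kernel name and build option, for illustration only.
    const std::set<std::string> build_opts{"-DDATA_TYPE_IN=uchar"};
    auto kernel = lib.create_kernel("cast", build_opts);
    (void)kernel;
}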
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h
new file mode 100644
index 000000000..6bd33bf8f
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLCASTKERNEL_H__
+#define __ARM_COMPUTE_CLCASTKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform a cast operation */
+class CLCastKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLCastKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLCastKernel(const CLCastKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLCastKernel &operator=(const CLCastKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLCastKernel(CLCastKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLCastKernel &operator=(CLCastKernel &&) = default;
+ /** Default destructor */
+ ~CLCastKernel() = default;
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input; /**< Source tensor */
+ ICLTensor *_output; /**< Destination tensor */
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLCASTKERNEL_H__ */
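Kernels in this layer can also be driven directly, without the CLCast runtime wrapper: configure with already-allocated tensors, then enqueue on the scheduler. A sketch under the assumption that the caller has initialised and allocated a U8 input and an F32 output of the same shape:

#include "arm_compute/core/CL/kernels/CLCastKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

void cast_u8_to_f32(arm_compute::CLTensor &in_u8, arm_compute::CLTensor &out_f32)
{
    arm_compute::CLCastKernel cast_kernel;
    cast_kernel.configure(&in_u8, &out_f32);              // output type selects the cast target
    arm_compute::CLScheduler::get().enqueue(cast_kernel); // run on the default CL queue
}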
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherKernel.h
new file mode 100644
index 000000000..a51441aca
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherKernel.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLGATHERKERNEL_H__
+#define __ARM_COMPUTE_CLGATHERKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the gather kernel.
+ *
+ */
+class CLGatherKernel : public ICLKernel
+{
+public:
+ /** Default constructor.*/
+ CLGatherKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLGatherKernel(const CLGatherKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLGatherKernel &operator=(const CLGatherKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLGatherKernel(CLGatherKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLGatherKernel &operator=(CLGatherKernel &&) = default;
+ /** Initialise the kernel's inputs and output.
+ *
+ * @param[in] input1 An input tensor. Data types supported: U8/S32/F32.
+ * @param[in] input2 An input tensor. Data types supported: S32.
+ * @param[out] output The output tensor, Data types supported: same as @p input1.
+ */
+ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLGatherKernel
+ *
+ * @param[in] input1 An input tensor. Data types supported: U8/S32/F32.
+ * @param[in] input2 An input tensor. Data types supported: S32.
+ * @param[out] output The output tensor, Data types supported: same as @p input1.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input1;
+ const ICLTensor *_input2;
+ ICLTensor *_output;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLGATHERKERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h
new file mode 100644
index 000000000..cd2b255bc
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLPIXELWISEDIVISIONKERNEL_H__
+#define __ARM_COMPUTE_CLPIXELWISEDIVISIONKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the pixelwise division kernel.
+ *
+ */
+class CLPixelWiseDivisionKernel : public ICLKernel
+{
+public:
+ /** Default constructor.*/
+ CLPixelWiseDivisionKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLPixelWiseDivisionKernel(const CLPixelWiseDivisionKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLPixelWiseDivisionKernel &operator=(const CLPixelWiseDivisionKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLPixelWiseDivisionKernel(CLPixelWiseDivisionKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLPixelWiseDivisionKernel &operator=(CLPixelWiseDivisionKernel &&) = default;
+ /** Initialise the kernel's input, output and border mode.
+ *
+ * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ * @param[in] input2 An input tensor. Data types supported: same as @p input1.
+ * @param[out] output The output tensor, Data types supported: same as @p input1. Note:
+ * U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
+ * @param[in] scale Scale to apply after division.
+ * Scale must be positive and its value must be either 1/255 or 1/2^n
+ * where n is between 0 and 15. For QS8 and QS16 scale must be 1.
+ * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
+ * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest
+ * even.
+ */
+ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale,
+ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLPixelWiseDivisionKernel
+ *
+ * @param[in] input1 An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ * @param[in] input2 An input tensor info. Data types supported: same as @p input1.
+ * @param[in] output The output tensor info, Data types supported: same as @p input1.
+ * Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
+ * @param[in] scale Scale to apply after division.
+ * Scale must be positive and its value must be either 1/255 or 1/2^n
+ * where n is between 0 and 15. For QS8 and QS16 scale must be 1.
+ * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
+ * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output, float scale, ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+ BorderSize border_size() const override;
+
+private:
+ const ICLTensor *_input1;
+ const ICLTensor *_input2;
+ ICLTensor *_output;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLPIXELWISEDIVISIONKERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h
new file mode 100644
index 000000000..a7d96cc5c
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLREDUCEMAXKERNEL_H__
+#define __ARM_COMPUTE_CLREDUCEMAXKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the reduce max kernel.
+ *
+ */
+class CLReduceMaxKernel : public ICLKernel
+{
+public:
+ /** Default constructor.*/
+ CLReduceMaxKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLReduceMaxKernel(const CLReduceMaxKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLReduceMaxKernel &operator=(const CLReduceMaxKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLReduceMaxKernel(CLReduceMaxKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLReduceMaxKernel &operator=(CLReduceMaxKernel &&) = default;
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ * @param[in] axis Axis to reduce
+ * @param[out] output The output tensor. Data types supported: same as @p input.
+ */
+ void configure(const ICLTensor *input, int32_t axis, ICLTensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLReduceMaxKernel
+ *
+ * @param[in] input An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ * @param[in] axis Axis to reduce
+ * @param[in] output The output tensor info. Data types supported: same as @p input.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+ void run_on_cpu(cl::CommandQueue &queue);
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+ int32_t _axis;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLREDUCEMAXKERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReductionMeanKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReductionMeanKernel.h
new file mode 100644
index 000000000..de9df3381
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReductionMeanKernel.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLREDUCTIONMEANKERNEL_H__
+#define __ARM_COMPUTE_CLREDUCTIONMEANKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the reduction mean operation kernel */
+class CLReductionMeanKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLReductionMeanKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLReductionMeanKernel(const CLReductionMeanKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLReductionMeanKernel &operator=(const CLReductionMeanKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLReductionMeanKernel(CLReductionMeanKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLReductionMeanKernel &operator=(CLReductionMeanKernel &&) = default;
+ /** Default destructor */
+ ~CLReductionMeanKernel() = default;
+
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data types supported: F32. Data layouts supported: NCHW.
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
+ * Output will have the same number of dimensions as input.
+ * @param[in] axis Axis along which to reduce. Supported reduction axis : 0, 1
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLReductionMeanKernel.
+ *
+ * @param[in] input Source tensor info. Data types supported: F32. Data layouts supported: NCHW.
+ * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p
+ * input.
+ * Output will have the same number of dimensions as input.
+ * @param[in] axis Axis along which to reduce. Supported reduction axis : 0, 1
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ std::vector<uint32_t> axis);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+ BorderSize border_size() const override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+ std::vector<uint32_t> _reduction_axis;
+ BorderSize _border_size;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLREDUCTIONMEANKERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceKernel.h
new file mode 100644
index 000000000..248ae6635
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceKernel.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLSTRIDEDSLICEKERNEL_H__
+#define __ARM_COMPUTE_CLSTRIDEDSLICEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel to extract a strided slice of a tensor */
+class CLStridedSliceKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLStridedSliceKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLStridedSliceKernel(const CLStridedSliceKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLStridedSliceKernel &operator=(const CLStridedSliceKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLStridedSliceKernel(CLStridedSliceKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLStridedSliceKernel &operator=(CLStridedSliceKernel &&) = default;
+ /** Default destructor */
+ ~CLStridedSliceKernel() = default;
+ /** Set the input and output of the kernel
+ *
+ * @param[in] input Source tensor. Data type supported:
+ * U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32
+ * @param[out] output Destination tensor. Data type supported: Same as @p input
+ * @param[in] beginData The begin tensor. Data types supported: S32.
+ * The number of dimensions must be 1.
+ * The length must be the same as the number of dimensions of input.
+ * @param[in] endData The end tensor. Data types supported: S32.
+ * The number of dimensions must be 1.
+ * The length must be the same as the number of dimensions of input.
+ * @param[in] stridesData The strides tensor. Data types supported: S32.
+ * The number of dimensions must be 1.
+ * The length must be the same as the number of dimensions of input.
+ * @param[in] beginMask Mask for begin
+ * @param[in] endMask Mask for end
+ * @param[in] shrinkAxisMask Mask for shrink axis.
+ *
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData,
+ ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask,
+ int32_t shrinkAxisMask);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLStridedSliceKernel
+ *
+ * @param[in] input The input tensor info. Data types supported:
+ * U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32
+ * @param[in] output The output tensor info. Data types supported: same as @p input.
+ * @param[in] begin The begin tensor info. Data types supported: S32.
+ * The number of dimensions must be 1.
+ * The length must be the same as the number of dimensions of input.
+ * @param[in] end The end tensor info. Data types supported: S32.
+ * The number of dimensions must be 1.
+ * The length must be the same as the number of dimensions of input.
+ * @param[in] stride The stride tensor info. Data types supported: S32.
+ * The number of dimensions must be 1.
+ * The length must be the same as the number of dimensions of input.
+ * @param[in] beginMask Mask for begin
+ * @param[in] endMask Mask for end
+ * @param[in] shrinkAxisMask Mask for shrink axis.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *begin, const ITensorInfo *end,
+ const ITensorInfo *stride, int32_t beginMask, int32_t endMask,
+ int32_t shrinkAxisMask);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input; /** Source tensor */
+ ICLTensor *_output; /** Destination tensor */
+ ICLTensor *_beginData; /** Start indices of input tensor */
+ ICLTensor *_endData; /** Stop indices of input tensor */
+ ICLTensor *_stridesData; /** Strides tensor */
+ int32_t _beginMask; /** Begin mask */
+ int32_t _endMask; /** End mask */
+ int32_t _shrinkAxisMask; /** Shrink axis mask */
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLSTRIDEDSLICEKERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h
new file mode 100644
index 000000000..5c567f38e
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h
@@ -0,0 +1,301 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLTOPKV2KERNEL_H__
+#define __ARM_COMPUTE_CLTOPKV2KERNEL_H__
+
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/CL/ICLKernel.h"
+
+#include <array>
+
+// these parameters can be changed
+#define _ITEMS 16 // number of items in a group
+#define _GROUPS 4 // the number of virtual processors is _ITEMS * _GROUPS
+#define _HISTOSPLIT (_ITEMS * _GROUPS / 2) // number of splits of the histogram
+#define PERMUT // store the final permutation
+////////////////////////////////////////////////////////
+
+namespace arm_compute
+{
+class ICLTensor;
+
+class CLTopKV2Single : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLTopKV2Single();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTopKV2Single(const CLTopKV2Single &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTopKV2Single &operator=(const CLTopKV2Single &) = delete;
+ /** Allow instances of this class to be moved */
+ CLTopKV2Single(CLTopKV2Single &&) = default;
+ /** Allow instances of this class to be moved */
+ CLTopKV2Single &operator=(CLTopKV2Single &&) = default;
+
+ void configure(ICLTensor *input, ICLTensor *topk_values, ICLTensor *topk_indices,
+ cl::Buffer *indices, cl::Buffer *temp_stack, int k, int n);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ ICLTensor *_input;
+ ICLTensor *_topk_values;
+ ICLTensor *_topk_indices;
+};
+
+class CLTopKV2Init : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLTopKV2Init();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTopKV2Init(const CLTopKV2Init &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTopKV2Init &operator=(const CLTopKV2Init &) = delete;
+ /** Allow instances of this class to be moved */
+ CLTopKV2Init(CLTopKV2Init &&) = default;
+ /** Allow instances of this class to be moved */
+ CLTopKV2Init &operator=(CLTopKV2Init &&) = default;
+
+ void configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffer *in_ind_buf, int n);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ ICLTensor *_input;
+};
+
+class CLRadixSortHistogram : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLRadixSortHistogram();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLRadixSortHistogram(const CLRadixSortHistogram &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLRadixSortHistogram &operator=(const CLRadixSortHistogram &) = delete;
+ /** Allow instances of this class to be moved */
+ CLRadixSortHistogram(CLRadixSortHistogram &&) = default;
+ /** Allow instances of this class to be moved */
+ CLRadixSortHistogram &operator=(CLRadixSortHistogram &&) = default;
+
+ void configure(cl::Buffer *hist_buf, int bits, int n);
+
+ void setPass(int pass, cl::Buffer *in_key_buf)
+ {
+ _pass = pass;
+ _in_key_buf = in_key_buf;
+ }
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ int _pass;
+ cl::Buffer *_in_key_buf;
+};
+
+class CLRadixSortScanHistogram : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLRadixSortScanHistogram();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLRadixSortScanHistogram(const CLRadixSortScanHistogram &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLRadixSortScanHistogram &operator=(const CLRadixSortScanHistogram &) = delete;
+ /** Allow instances of this class to be moved */
+ CLRadixSortScanHistogram(CLRadixSortScanHistogram &&) = default;
+ /** Allow instances of this class to be moved */
+ CLRadixSortScanHistogram &operator=(CLRadixSortScanHistogram &&) = default;
+
+ void configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+};
+
+class CLRadixSortGlobalScanHistogram : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLRadixSortGlobalScanHistogram();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLRadixSortGlobalScanHistogram(const CLRadixSortGlobalScanHistogram &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLRadixSortGlobalScanHistogram &operator=(const CLRadixSortGlobalScanHistogram &) = delete;
+ /** Allow instances of this class to be moved */
+ CLRadixSortGlobalScanHistogram(CLRadixSortGlobalScanHistogram &&) = default;
+ /** Allow instances of this class to be moved */
+ CLRadixSortGlobalScanHistogram &operator=(CLRadixSortGlobalScanHistogram &&) = default;
+
+ void configure(cl::Buffer *glob_sum_buf, cl::Buffer *temp_buf, int bits);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+};
+
+class CLRadixSortPasteHistogram : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLRadixSortPasteHistogram();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLRadixSortPasteHistogram(const CLRadixSortPasteHistogram &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLRadixSortPasteHistogram &operator=(const CLRadixSortPasteHistogram &) = delete;
+ /** Allow instances of this class to be moved */
+ CLRadixSortPasteHistogram(CLRadixSortPasteHistogram &&) = default;
+ /** Allow instances of this class to be moved */
+ CLRadixSortPasteHistogram &operator=(CLRadixSortPasteHistogram &&) = default;
+
+ void configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+};
+
+class CLRadixSortReorder : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLRadixSortReorder();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLRadixSortReorder(const CLRadixSortReorder &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLRadixSortReorder &operator=(const CLRadixSortReorder &) = delete;
+ /** Allow instances of this class to be moved */
+ CLRadixSortReorder(CLRadixSortReorder &&) = default;
+ /** Allow instances of this class to be moved */
+ CLRadixSortReorder &operator=(CLRadixSortReorder &&) = default;
+
+ void configure(cl::Buffer *hist_buf, int bits, int n);
+
+ void setPass(int pass, cl::Buffer *in_key_buf, cl::Buffer *out_key_buf, cl::Buffer *in_ind_buf,
+ cl::Buffer *out_ind_buf)
+ {
+ _pass = pass;
+ _in_key_buf = in_key_buf;
+ _out_key_buf = out_key_buf;
+ _in_ind_buf = in_ind_buf;
+ _out_ind_buf = out_ind_buf;
+ }
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ int _pass;
+ cl::Buffer *_in_key_buf;
+ cl::Buffer *_out_key_buf;
+ cl::Buffer *_in_ind_buf;
+ cl::Buffer *_out_ind_buf;
+};
+
+class CLTopKV2FindFirstNegative : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLTopKV2FindFirstNegative();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTopKV2FindFirstNegative(const CLTopKV2FindFirstNegative &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTopKV2FindFirstNegative &operator=(const CLTopKV2FindFirstNegative &) = delete;
+ /** Allow instances of this class to be moved */
+ CLTopKV2FindFirstNegative(CLTopKV2FindFirstNegative &&) = default;
+ /** Allow instances of this class to be moved */
+ CLTopKV2FindFirstNegative &operator=(CLTopKV2FindFirstNegative &&) = default;
+
+ void configure(cl::Buffer *first_negative_idx_buf, int n);
+
+ void setOutputBuffer(cl::Buffer *out_key_buf) { _out_key_buf = out_key_buf; }
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ cl::Buffer *_out_key_buf;
+};
+
+class CLTopKV2ReorderNegatives : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLTopKV2ReorderNegatives();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTopKV2ReorderNegatives(const CLTopKV2ReorderNegatives &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTopKV2ReorderNegatives &operator=(const CLTopKV2ReorderNegatives &) = delete;
+ /** Allow instances of this class to be moved */
+ CLTopKV2ReorderNegatives(CLTopKV2ReorderNegatives &&) = default;
+ /** Allow instances of this class to be moved */
+ CLTopKV2ReorderNegatives &operator=(CLTopKV2ReorderNegatives &&) = default;
+
+ void configure(cl::Buffer *first_negative_idx_buf, int n);
+
+ void setBuffers(cl::Buffer *in_key_buf, cl::Buffer *out_key_buf, cl::Buffer *in_ind_buf,
+ cl::Buffer *out_ind_buf)
+ {
+ _in_key_buf = in_key_buf;
+ _out_key_buf = out_key_buf;
+ _in_ind_buf = in_ind_buf;
+ _out_ind_buf = out_ind_buf;
+ }
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ cl::Buffer *_in_key_buf;
+ cl::Buffer *_out_key_buf;
+ cl::Buffer *_in_ind_buf;
+ cl::Buffer *_out_ind_buf;
+};
+
+class CLTopKV2Store : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLTopKV2Store();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTopKV2Store(const CLTopKV2Store &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTopKV2Store &operator=(const CLTopKV2Store &) = delete;
+ /** Allow instances of this class to be moved */
+ CLTopKV2Store(CLTopKV2Store &&) = default;
+ /** Allow instances of this class to be moved */
+ CLTopKV2Store &operator=(CLTopKV2Store &&) = default;
+
+ void configure(ICLTensor *values, ICLTensor *indices, int k, int n);
+
+ void setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ ICLTensor *_values;
+ ICLTensor *_indices;
+ cl::Buffer *_out_key_buf;
+ cl::Buffer *_out_ind_buf;
+};
+
+} // namespace arm_compute
+
+#endif // __ARM_COMPUTE_CLTOPKV2KERNEL_H__
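These kernels are the building blocks of a GPU radix sort: per pass, a per-group histogram of the current digit is built, scanned locally and globally, pasted back, and the keys/indices are scattered to their new positions. The sketch below shows how a driver might chain them, one digit per pass; it assumes the kernel objects have already been configure()d with the buffers involved, and the number of passes (key bits divided by bits sorted per pass) as well as all buffer names are illustrative.

#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"

#include <utility>

// Sketch only: drives already-configured radix-sort kernels for `total_passes`
// passes, ping-ponging between the in/out key and index buffers.
void radix_sort_passes(arm_compute::CLRadixSortHistogram &histogram,
                       arm_compute::CLRadixSortScanHistogram &scan,
                       arm_compute::CLRadixSortGlobalScanHistogram &global_scan,
                       arm_compute::CLRadixSortPasteHistogram &paste,
                       arm_compute::CLRadixSortReorder &reorder,
                       cl::Buffer *in_keys, cl::Buffer *out_keys,
                       cl::Buffer *in_inds, cl::Buffer *out_inds, int total_passes)
{
    auto &sched = arm_compute::CLScheduler::get();
    for (int pass = 0; pass < total_passes; ++pass)
    {
        histogram.setPass(pass, in_keys);  // histogram of the current digit
        sched.enqueue(histogram, false);
        sched.enqueue(scan, false);        // local scan of the histogram
        sched.enqueue(global_scan, false); // scan of the per-split sums
        sched.enqueue(paste, false);       // paste global sums back into the histogram
        reorder.setPass(pass, in_keys, out_keys, in_inds, out_inds);
        sched.enqueue(reorder, false);     // scatter keys/indices by digit
        std::swap(in_keys, out_keys);      // output of this pass feeds the next
        std::swap(in_inds, out_inds);
    }
    sched.sync();                          // wait for the last pass to finish
}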
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h
new file mode 100644
index 000000000..63050067d
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLCAST_H__
+#define __ARM_COMPUTE_CLCAST_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLCastKernel
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32.
+ * @note The function converts the input tensor to the data type of the output tensor.
+ */
+class CLCast : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in, out] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * The input tensor is [in, out] because its TensorInfo might be modified
+ * inside the kernel.
+ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ */
+ void configure(ICLTensor *input, ICLTensor *output);
+};
+}
+#endif /* __ARM_COMPUTE_CLCAST_H__ */
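At the function level the usual ACL flow applies: initialise the scheduler, configure, allocate, run. A minimal sketch; the 1-D shape of 16 elements and the U8-to-F32 conversion are illustrative assumptions.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLCast.h"

void run_cast_example()
{
    arm_compute::CLScheduler::get().default_init(); // default CL context/queue

    arm_compute::CLTensor input, output;
    const arm_compute::TensorShape shape(16U);
    input.allocator()->init(arm_compute::TensorInfo(shape, 1, arm_compute::DataType::U8));
    output.allocator()->init(arm_compute::TensorInfo(shape, 1, arm_compute::DataType::F32));

    arm_compute::CLCast cast;
    cast.configure(&input, &output); // output data type decides the cast target

    input.allocator()->allocate();
    output.allocator()->allocate();
    // ... fill `input` here (map/unmap) ...
    cast.run();
}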
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h
new file mode 100644
index 000000000..3ae7afe14
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLGATHER_H__
+#define __ARM_COMPUTE_CLGATHER_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLGatherKernel. */
+class CLGather : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs and output.
+ *
+ * @param[in] input1 An input tensor. Data types supported: U8/S32/F32.
+ * @param[in] input2 An indexes tensor. Data types supported: S32.
+ * @param[out] output The output tensor, Data types supported: same as @p input1.
+ */
+ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLGather
+ *
+ * @param[in] input1 An input tensor. Data types supported: U8/S32/F32.
+ * @param[in] input2 An indexes tensor. Data types supported: S32.
+ * @param[out] output The output tensor, Data types supported: same as @p input1.
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output);
+};
+}
+#endif /*__ARM_COMPUTE_CLGATHER_H__ */
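A sketch of the validate-then-configure pattern for CLGather, assuming the caller has already initialised an F32 values tensor, an S32 indices tensor and a matching F32 output:

#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGather.h"

void run_gather(arm_compute::CLTensor &values, arm_compute::CLTensor &indices,
                arm_compute::CLTensor &output)
{
    // Check the configuration before building the kernel.
    ARM_COMPUTE_ERROR_THROW_ON(
        arm_compute::CLGather::validate(values.info(), indices.info(), output.info()));

    arm_compute::CLGather gather;
    gather.configure(&values, &indices, &output);
    gather.run();
}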
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h
new file mode 100644
index 000000000..c1383e21f
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLPIXELWISEDIVISION_H__
+#define __ARM_COMPUTE_CLPIXELWISEDIVISION_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLPixelWiseDivisionKernel. */
+class CLPixelWiseDivision : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs, output and conversion policy.
+ *
+ * @param[in, out] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ * The input tensor is [in, out] because its TensorInfo might be
+ * modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[in, out] input2 An input tensor. Data types supported: same as @p input1.
+ * The input tensor is [in, out] because its TensorInfo might be
+ * modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[out] output The output tensor, Data types supported: same as @p input1.
+ * Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
+ * @param[in] scale Scale to apply after division.
+ * Scale must be positive and its value must be either 1/255 or
+ * 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1.
+ * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
+ * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest
+ * even.
+ */
+ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale = 1.f,
+ ConvertPolicy overflow_policy = ConvertPolicy::WRAP,
+ RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLPixelWiseDivision
+ *
+ * @param[in] input1 An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ * @param[in] input2 An input tensor info. Data types supported: same as @p input1.
+ * @param[in] output The output tensor info, Data types supported: same as @p input1.
+ * Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
+ * @param[in] scale Scale to apply after division.
+ * Scale must be positive and its value must be either 1/255 or 1/2^n
+ * where n is between 0 and 15. For QS8 and QS16 scale must be 1.
+ * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
+ * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output, float scale = 1.f,
+ ConvertPolicy overflow_policy = ConvertPolicy::WRAP,
+ RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO);
+};
+}
+#endif /*__ARM_COMPUTE_CLPIXELWISEDIVISION_H__ */
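A sketch of an element-wise division with a post-scale of 1/255 and saturating, round-to-nearest-even behaviour (one of the documented scale/policy combinations); the tensors are assumed to be initialised and allocated by the caller with identical shapes and F32 type:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLPixelWiseDivision.h"

void divide_and_scale(arm_compute::CLTensor &x, arm_compute::CLTensor &y,
                      arm_compute::CLTensor &out)
{
    arm_compute::CLPixelWiseDivision div;
    div.configure(&x, &y, &out, 1.f / 255.f,               // out = (x / y) * scale
                  arm_compute::ConvertPolicy::SATURATE,
                  arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
    div.run();
}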
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h
new file mode 100644
index 000000000..14b473f33
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLREDUCE_MAX_H__
+#define __ARM_COMPUTE_CLREDUCE_MAX_H__
+
+#include "arm_compute/runtime/CL/CLArray.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute the ReduceMax operation. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLReduceMaxKernel
+ */
+class CLReduceMax : public IFunction
+{
+public:
+ /** Constructor */
+ CLReduceMax();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLReduceMax(const CLReduceMax &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLReduceMax &operator=(const CLReduceMax &) = delete;
+ /** Allow instances of this class to be moved */
+ CLReduceMax(CLReduceMax &&) = default;
+ /** Allow instances of this class to be moved */
+ CLReduceMax &operator=(CLReduceMax &&) = default;
+ /** Initialise the function's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: F32
+ * @param[in] axis Axis to reduce. Data type supported: S32
+ * @param[out] output Output tensor holding the maximum values along the reduced axis. Data types supported: F32.
+ */
+ void configure(ICLTensor *input, int32_t axis, ICLTensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLReduceMax
+ *
+ * @param[in] input Input tensor info. Data types supported: F32
+ * @param[in] axis Axis to reduce. Data type supported: S32
+ * @param[in] output Output tensor info holding the maximum values along the reduced axis. Data types supported: F32.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ void run_on_cpu();
+
+ int32_t _axis;
+
+ ICLTensor *_input;
+ ICLTensor *_output;
+
+ std::unique_ptr<ICLKernel> _kernel;
+};
+}
+#endif /*__ARM_COMPUTE_CLREDUCE_MAX_H__ */
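A sketch of reducing an F32 tensor to its maxima along one axis; the axis value and tensor setup are assumptions, with initialisation and allocation left to the caller:

#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLReduceMax.h"

void reduce_max_axis0(arm_compute::CLTensor &input, arm_compute::CLTensor &output)
{
    arm_compute::CLReduceMax reduce;
    reduce.configure(&input, /*axis=*/0, &output); // keep the maxima along axis 0
    reduce.run();
}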
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReductionMean.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReductionMean.h
new file mode 100644
index 000000000..2081518c1
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReductionMean.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ARM_COMPUTE_CLREDUCTIONMEAN_H__
+#define __ARM_COMPUTE_CLREDUCTIONMEAN_H__
+
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "arm_compute/core/CL/kernels/CLReductionMeanKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to perform the reduction mean operation. This function calls @ref CLFillBorderKernel and @ref CLReductionMeanKernel.
+ */
+class CLReductionMean : public IFunction
+{
+public:
+ /** Default Constructor.
+ */
+ CLReductionMean();
+
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data types supported: F32. Data layouts supported: NCHW.
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
+ * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1
+ */
+ void configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLReductionMean.
+ *
+ * @param[in] input Source tensor info. Data types supported: F32. Data layouts supported: NCHW.
+ * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ std::vector<uint32_t> axis);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ CLReductionMeanKernel _reduction_mean_kernel;
+ CLFillBorderKernel _fill_border_kernel;
+};
+}
+#endif /*__ARM_COMPUTE_CLREDUCTIONMEAN_H__ */
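A hedged usage sketch of CLReductionMean (not part of the patch; it assumes F32 NCHW tensors and omits the usual allocation ordering details):

    #include <cstdint>
    #include <vector>

    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLReductionMean.h"

    using namespace arm_compute;

    void reduction_mean_example(CLTensor &input, CLTensor &output)
    {
        // Mean over the first two dimensions (W and H of an NCHW tensor); in a real
        // program configure() is called before the allocators allocate backing memory.
        CLReductionMean mean;
        mean.configure(&input, &output, std::vector<uint32_t>{0, 1});
        mean.run();
    }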
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSlice.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSlice.h
new file mode 100644
index 000000000..f223a79be
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSlice.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLSTRIDEDSLICE_H__
+#define __ARM_COMPUTE_CLSTRIDEDSLICE_H__
+
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLStridedSliceKernel */
+class CLStridedSlice : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs and outputs
+ *
+ * @param[in] input Source tensor. Data type supported:
+ * U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32
+ * @param[out] output Destination tensor. Data type supported: Same as @p input
+ * @param[in] beginData Tensor holding the begin index for each dimension
+ * @param[in] endData Tensor holding the end index for each dimension
+ * @param[in] stridesData Tensor holding the stride for each dimension
+ * @param[in] beginMask Bit mask marking dimensions whose begin index is ignored
+ * @param[in] endMask Bit mask marking dimensions whose end index is ignored
+ * @param[in] shrinkAxisMask Bit mask marking dimensions to be shrunk to a single element
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData,
+ ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask,
+ int32_t shrinkAxisMask);
+};
+
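+/** Basic function to run a strided slice on the CPU by mapping the CL tensors to host memory. */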
+class CLStridedSliceCPU : public IFunction
+{
+public:
+ /** Initialise inputs and outputs
+ *
+ * @note The remaining parameters follow @ref CLStridedSlice::configure().
+ *
+ * @param[in] input Source tensor.
+ * @param[out] output Destination tensor.
+ */
+ void configure(ICLTensor *input, ICLTensor *output, ICLTensor *beginData, ICLTensor *endData,
+ ICLTensor *stridesData, int32_t beginMask, int32_t endMask,
+ int32_t shrinkAxisMask);
+
+ void run() override;
+
+private:
+ void run_on_cpu();
+
+ ICLTensor *_input;
+ ICLTensor *_output;
+ ICLTensor *_beginData;
+ ICLTensor *_endData;
+ ICLTensor *_stridesData;
+ int32_t _beginMask;
+ int32_t _endMask;
+ int32_t _shrinkAxisMask;
+};
+}
+#endif /*__ARM_COMPUTE_CLSTRIDEDSLICE_H__ */
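An illustrative configure() call for CLStridedSlice (a sketch, not part of the patch; the S32 data type of the begin/end/strides tensors, the shapes and the zero mask values are assumptions):

    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLStridedSlice.h"

    using namespace arm_compute;

    void strided_slice_example(CLTensor &input, CLTensor &output)
    {
        // One element per input dimension: begin/end indices and strides, assumed S32.
        CLTensor begin, end, strides;
        const TensorShape params_shape(input.info()->num_dimensions());
        begin.allocator()->init(TensorInfo(params_shape, 1, DataType::S32));
        end.allocator()->init(TensorInfo(params_shape, 1, DataType::S32));
        strides.allocator()->init(TensorInfo(params_shape, 1, DataType::S32));

        CLStridedSlice slice;
        // No mask bits set: begin/end/strides are taken as provided for every dimension.
        slice.configure(&input, &output, &begin, &end, &strides,
                        /* beginMask */ 0, /* endMask */ 0, /* shrinkAxisMask */ 0);
    }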
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h
new file mode 100644
index 000000000..06cd1ee9b
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLTOPK_V2_H__
+#define __ARM_COMPUTE_CLTOPK_V2_H__
+
+#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h"
+
+#include "arm_compute/runtime/CL/CLArray.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute the TopK operation (top k values and their indices). This function calls
+ * the OpenCL kernels declared in CLTopKV2Kernel.h (initialisation, radix-sort/quicksort and store kernels).
+ */
+class CLTopKV2 : public IFunction
+{
+public:
+ /** Constructor */
+ CLTopKV2();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTopKV2(const CLTopKV2 &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTopKV2 &operator=(const CLTopKV2 &) = delete;
+ /** Allow instances of this class to be moved */
+ CLTopKV2(CLTopKV2 &&) = default;
+ /** Allow instances of this class to be moved */
+ CLTopKV2 &operator=(CLTopKV2 &&) = default;
+ /** Initialise the kernel's inputs and outputs.
+ *
+ * @param[in] input Input tensor. Data types supported: U8/S16/F32.
+ * @param[in] k Number of top elements to select.
+ * @param[out] values Top k values. Data types supported: S32 if input type is U8/S16, F32 if
+ * input type is F32.
+ * @param[out] indices Indices related to top k values. Data types supported: S32 if input type
+ * is U8/S16, F32 if input type is F32.
+ * @param[in] total_bits (Optional) Total number of key bits covered by the radix sort. Defaults to 32.
+ * @param[in] bits (Optional) Number of bits sorted per radix-sort pass. Defaults to 4.
+ */
+ void configure(ICLTensor *input, int k, ICLTensor *values, ICLTensor *indices,
+ int total_bits = 32, int bits = 4);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ void run_on_cpu();
+ void run_on_gpu();
+ void run_on_gpu_single_quicksort();
+
+ uint32_t _k;
+ uint32_t _total_bits;
+ uint32_t _bits;
+ uint32_t _radix;
+ uint32_t _hist_buf_size;
+ uint32_t _glob_sum_buf_size;
+ uint32_t _n;
+
+ ICLTensor *_input;
+ ICLTensor *_values;
+ ICLTensor *_indices;
+
+ cl::Buffer _qs_idx_buf;
+ cl::Buffer _qs_temp_buf;
+ cl::Buffer _hist_buf;
+ cl::Buffer _glob_sum_buf;
+ cl::Buffer _temp_buf;
+ cl::Buffer _first_negative_idx_buf;
+ cl::Buffer _in_key_buf;
+ cl::Buffer _out_key_buf;
+ cl::Buffer _in_ind_buf;
+ cl::Buffer _out_ind_buf;
+
+ cl::Buffer *_p_in_key_buf;
+ cl::Buffer *_p_out_key_buf;
+ cl::Buffer *_p_in_ind_buf;
+ cl::Buffer *_p_out_ind_buf;
+
+ CLTopKV2Single _qs_kernel;
+ CLTopKV2Init _init_kernel;
+ CLRadixSortHistogram _hist_kernel;
+ CLRadixSortScanHistogram _scan_hist_kernel;
+ CLRadixSortGlobalScanHistogram _glob_scan_hist_kernel;
+ CLRadixSortPasteHistogram _paste_hist_kernel;
+ CLRadixSortReorder _reorder_kernel;
+ CLTopKV2FindFirstNegative _find_first_negative_kernel;
+ CLTopKV2ReorderNegatives _reorder_negatives_kernel;
+ CLTopKV2Store _store_kernel;
+};
+}
+#endif // __ARM_COMPUTE_CLTOPK_V2_H__
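A hedged usage sketch of CLTopKV2 (not part of the patch; shapes and allocation handling are illustrative, and the output data types follow the configure() documentation above):

    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLTopKV2.h"

    using namespace arm_compute;

    void top_k_example(CLTensor &input /* 1-D F32 tensor */, int k)
    {
        // For an F32 input, values and indices are both reported as F32 per the doc above.
        CLTensor values, indices;
        values.allocator()->init(TensorInfo(TensorShape(static_cast<unsigned int>(k)), 1, DataType::F32));
        indices.allocator()->init(TensorInfo(TensorShape(static_cast<unsigned int>(k)), 1, DataType::F32));

        CLTopKV2 topk;
        topk.configure(&input, k, &values, &indices); // default radix-sort parameters: 32 bits total, 4 per pass

        values.allocator()->allocate();
        indices.allocator()->allocate();
        topk.run();
    }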
diff --git a/libs/ARMComputeEx/resolve_includes.py b/libs/ARMComputeEx/resolve_includes.py
new file mode 100644
index 000000000..b3e252892
--- /dev/null
+++ b/libs/ARMComputeEx/resolve_includes.py
@@ -0,0 +1,102 @@
+# Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+# Copyright (c) 2016, 2017 ARM Limited.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+import collections
+import os.path
+import re
+import subprocess
+import glob
+
+
+def resolve_includes(target, source):
+ # File collection
+ FileEntry = collections.namedtuple('FileEntry', 'target_name file_contents')
+
+ # Include pattern
+ pattern = re.compile("#include \"(.*)\"")
+
+ # Get file contents
+ files = []
+ for i in range(len(source)):
+ src = source[i]
+ dst = target[i]
+ f = open(src)
+ cts = f.read()
+ f.close()
+ contents = cts.splitlines()
+ entry = FileEntry(target_name=dst, file_contents=contents)
+ files.append((os.path.basename(src), entry))
+
+ # Create dictionary of tupled list
+ files_dict = dict(files)
+
+ # Check for includes (can only be files in the same folder)
+ final_files = []
+ for file in files:
+ done = False
+ tmp_file = file[1].file_contents
+ print(file[1].target_name)
+ while not done:
+ file_count = 0
+ updated_file = []
+ for line in tmp_file:
+ found = pattern.search(line)
+ if found:
+ include_file = found.group(1)
+ data = files_dict[include_file].file_contents
+ updated_file.extend(data)
+ else:
+ updated_file.append(line)
+ file_count += 1
+
+ # Check if all includes have been replaced.
+ if file_count == len(tmp_file):
+ done = True
+
+ # Update temp file
+ tmp_file = updated_file
+
+ # Wrap the expanded file in raw string literal delimiters and add it to the final list
+ tmp_file.insert(0, "R\"(\n")
+ tmp_file.append("\n)\"")
+ entry = FileEntry(target_name=file[1].target_name, file_contents=tmp_file)
+ final_files.append((file[0], entry))
+
+ # Write output files
+ for file in final_files:
+ with open(file[1].target_name, 'w+') as out_file:
+ out_file.write("\n".join(file[1].file_contents))
+
+
+# Generate embed files
+cl_files = glob.glob('src/core/CL/cl_kernels/*.cl')
+cl_files += glob.glob('src/core/CL/cl_kernels/*.h')
+
+# DEBUG: print cl files
+print("cl_files:")
+print(cl_files)
+
+embed_files = [f + "embed" for f in cl_files]
+print("embed_files:")
+print(embed_files)
+
+resolve_includes(embed_files, cl_files)
diff --git a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
new file mode 100644
index 000000000..d535c5da4
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
@@ -0,0 +1,547 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Utils.h"
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <utility>
+#include <vector>
+
+using namespace arm_compute;
+
+const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map = {
+ {"absdiff", "absdiff.cl"},
+ {"accumulate", "accumulate.cl"},
+ {"accumulate_squared", "accumulate.cl"},
+ {"accumulate_weighted", "accumulate.cl"},
+ {"activation_layer", "activation_layer.cl"},
+ {"activation_layer_qa8", "activation_layer_qa8.cl"},
+ {"activation_layer_logistic_qa8", "activation_layer_qa8.cl"},
+ {"arithmetic_add", "arithmetic_op.cl"},
+ {"arithmetic_sub", "arithmetic_op.cl"},
+ {"arithmetic_add_qasymm8", "arithmetic_op_quantized.cl"},
+ {"batchnormalization_layer_nchw", "batchnormalization_layer.cl"},
+ {"batchnormalization_layer_nhwc", "batchnormalization_layer.cl"},
+ {"bitwise_or", "bitwise_op.cl"},
+ {"bitwise_and", "bitwise_op.cl"},
+ {"bitwise_xor", "bitwise_op.cl"},
+ {"bitwise_not", "bitwise_op.cl"},
+ {"cast", "cast.cl"},
+ {"cast_qasymm_in", "cast.cl"},
+ {"cast_qasymm_out", "cast.cl"},
+ {"channel_combine_NV", "channel_combine.cl"},
+ {"channel_combine_RGB888", "channel_combine.cl"},
+ {"channel_combine_RGBA8888", "channel_combine.cl"},
+ {"channel_combine_UYVY422", "channel_combine.cl"},
+ {"channel_combine_YUYV422", "channel_combine.cl"},
+ {"channel_shuffle_nchw", "channel_shuffle.cl"},
+ {"channel_extract_NV12", "channel_extract.cl"},
+ {"channel_extract_NV21", "channel_extract.cl"},
+ {"channel_extract_RGB888", "channel_extract.cl"},
+ {"channel_extract_RGBA8888", "channel_extract.cl"},
+ {"channel_extract_UYVY422", "channel_extract.cl"},
+ {"channel_extract_YUYV422", "channel_extract.cl"},
+ {"combine_gradients_L1", "canny.cl"},
+ {"combine_gradients_L2", "canny.cl"},
+ {"concatenate_depth", "concatenate.cl"},
+ {"concatenate_width", "concatenate.cl"},
+ {"convolution_rectangle", "convolution_rectangle.cl"},
+ {"col2im", "col2im.cl"},
+ {"convert_depth_down", "depth_convert.cl"},
+ {"convert_depth_up", "depth_convert.cl"},
+ {"convert_fc_weights", "convert_fc_weights.cl"},
+ {"convolution3x3_static", "convolution3x3.cl"},
+ {"convolution5x5_static", "convolution5x5.cl"},
+ {"convolution7x7_static", "convolution7x7.cl"},
+ {"convolution9x9_static", "convolution9x9.cl"},
+ {"convolution_separable1x5_static", "convolution5x5.cl"},
+ {"convolution_separable5x1_static", "convolution5x5.cl"},
+ {"convolution_separable1x7_static", "convolution7x7.cl"},
+ {"convolution_separable7x1_static", "convolution7x7.cl"},
+ {"convolution_separable1x9_static", "convolution9x9.cl"},
+ {"convolution_separable9x1_static", "convolution9x9.cl"},
+ {"copy_tensor", "copy_tensor.cl"},
+ {"copy_plane", "channel_extract.cl"},
+ {"copy_planes_3p", "channel_combine.cl"},
+ {"copy_to_keypoint", "fast_corners.cl"},
+ {"deconvolution_upsample", "deconvolution_layer.cl"},
+ {"depthwise_convolution_3x3", "depthwise_convolution.cl"},
+ {"depthwise_convolution_3x3_f16", "depthwise_convolution.cl"},
+ {"depthwise_convolution_3x3_quantized_nchw", "depthwise_convolution_quantized.cl"},
+ {"depthwise_convolution_3x3_quantized_nhwc_stride1", "depthwise_convolution_quantized.cl"},
+ {"depthwise_convolution_3x3_quantized_nhwc_stride2", "depthwise_convolution_quantized.cl"},
+ {"depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16", "depthwise_convolution.cl"},
+ {"depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16", "depthwise_convolution.cl"},
+ {"depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32", "depthwise_convolution.cl"},
+ {"depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32", "depthwise_convolution.cl"},
+ {"depthwise_im2col", "depthwise_convolution.cl"},
+ {"depthwise_vector_to_tensor", "depthwise_convolution.cl"},
+ {"depthwise_weights_reshape", "depthwise_convolution.cl"},
+ {"dequantization_layer", "dequantization_layer.cl"},
+ {"derivative", "derivative.cl"},
+ {"dilate", "dilate.cl"},
+ {"direct_convolution1x1", "direct_convolution1x1.cl"},
+ {"direct_convolution1x1_f32_bifrost", "direct_convolution1x1.cl"},
+ {"direct_convolution3x3", "direct_convolution3x3.cl"},
+ {"direct_convolution3x3_f32_bifrost", "direct_convolution3x3.cl"},
+ {"direct_convolution5x5", "direct_convolution5x5.cl"},
+ {"direct_convolution5x5_f32_bifrost", "direct_convolution5x5.cl"},
+ {"direct_convolution_1x1_3x3_5x5_quantized", "direct_convolution_1x1_3x3_5x5_quantized.cl"},
+ {"erode", "erode.cl"},
+ {"fast_corners", "fast_corners.cl"},
+ {"fill_image_borders_constant", "fill_border.cl"},
+ {"fill_image_borders_replicate", "fill_border.cl"},
+ {"finalize", "optical_flow_pyramid_lk.cl"},
+ {"floor_layer", "floor.cl"},
+ {"gather", "gather.cl"},
+ {"gather_1d", "gather.cl"},
+ {"gather_1d_out", "gather.cl"},
+ {"gaussian1x5_sub_x", "gaussian_pyramid.cl"},
+ {"gaussian5x1_sub_y", "gaussian_pyramid.cl"},
+ {"gemm_accumulate_biases", "gemm.cl"},
+ {"gemm_interleave4x4", "gemm.cl"},
+ {"gemm_ma_f16", "gemm.cl"},
+ {"gemm_ma_f32", "gemm.cl"},
+ {"gemm_ma_qs8", "gemm.cl"},
+ {"gemm_ma_qs16", "gemm.cl"},
+ {"gemm_mv", "gemv.cl"},
+ {"gemm_mv_quantized", "gemv.cl"},
+ {"gemm_mm_interleaved_transposed_f16", "gemm.cl"},
+ {"gemm_mm_interleaved_transposed_f16_bifrost", "gemm.cl"},
+ {"gemm_mm_interleaved_transposed_f32", "gemm.cl"},
+ {"gemm_mm_interleaved_transposed_f32_bifrost", "gemm.cl"},
+ {"gemm_mm_interleaved_transposed_qs8", "gemm.cl"},
+ {"gemm_mm_interleaved_transposed_qs16", "gemm.cl"},
+ {"gemm_mm_floating_point", "gemm.cl"},
+ {"gemm_mm_floating_point_f16_bifrost", "gemm.cl"},
+ {"gemm_mm_floating_point_f32_bifrost", "gemm.cl"},
+ {"gemm_mm_floating_point_f32_bifrost_1000", "gemm.cl"},
+ {"gemm_mm_qs8", "gemm.cl"},
+ {"gemm_mm_qs16", "gemm.cl"},
+ {"gemm_lc_vm_f32", "gemm.cl"},
+ {"gemm_transpose1xW", "gemm.cl"},
+ {"gemmlowp_matrix_a_reduction", "gemmlowp.cl"},
+ {"gemmlowp_matrix_b_reduction", "gemmlowp.cl"},
+ {"gemmlowp_mm_bifrost", "gemmlowp.cl"},
+ {"gemmlowp_mm_midgard", "gemmlowp.cl"},
+ {"gemmlowp_mm_interleaved_transposed_bifrost", "gemmlowp.cl"},
+ {"gemmlowp_mm_interleaved_transposed_midgard", "gemmlowp.cl"},
+ {"gemmlowp_offset_contribution", "gemmlowp.cl"},
+ {"gemmlowp_output_stage_quantize_down", "gemmlowp.cl"},
+ {"gemmlowp_output_stage_quantize_down_fixedpoint", "gemmlowp.cl"},
+ {"harris_score_3x3", "harris_corners.cl"},
+ {"harris_score_5x5", "harris_corners.cl"},
+ {"harris_score_7x7", "harris_corners.cl"},
+ {"hist_border_kernel", "histogram.cl"},
+ {"hist_border_kernel_fixed", "histogram.cl"},
+ {"hist_local_kernel", "histogram.cl"},
+ {"hist_local_kernel_fixed", "histogram.cl"},
+ {"hog_block_normalization", "hog.cl"},
+ {"hog_detector", "hog.cl"},
+ {"hog_orientation_binning", "hog.cl"},
+ {"hysteresis", "canny.cl"},
+ {"im2col1x1_stridex1_dchw", "im2col.cl"},
+ {"im2col3x3_dchw", "im2col.cl"},
+ {"im2col5x5_dchw", "im2col.cl"},
+ {"im2col11x11_padx0_pady0_dchw", "im2col.cl"},
+ {"im2col_generic_dchw", "im2col.cl"},
+ {"im2col_generic_padx0_pady0_dchw", "im2col.cl"},
+ {"im2col_reduced_dchw", "im2col.cl"},
+ {"init_level", "optical_flow_pyramid_lk.cl"},
+ {"init_level_max", "optical_flow_pyramid_lk.cl"},
+ {"init_level_max_initial_estimate", "optical_flow_pyramid_lk.cl"},
+ {"integral_horizontal", "integral_image.cl"},
+ {"integral_vertical", "integral_image.cl"},
+ {"IYUV_to_NV12_bt709", "color_convert.cl"},
+ {"IYUV_to_RGB888_bt709", "color_convert.cl"},
+ {"IYUV_to_RGBA8888_bt709", "color_convert.cl"},
+ {"IYUV_to_YUV444_bt709", "color_convert.cl"},
+ {"l2_normalize", "l2_normalize.cl"},
+ {"lktracker_stage0", "optical_flow_pyramid_lk.cl"},
+ {"lktracker_stage1", "optical_flow_pyramid_lk.cl"},
+ {"magnitude_phase", "magnitude_phase.cl"},
+ {"mean_stddev_accumulate", "mean_stddev.cl"},
+ {"minmax", "minmaxloc.cl"},
+ {"minmax_border", "minmaxloc.cl"},
+ {"minmax_layer", "minmax_layer.cl"},
+ {"minmaxloc", "minmaxloc.cl"},
+ {"non_linear_filter_box3x3", "non_linear_filter3x3.cl"},
+ {"non_linear_filter_cross3x3", "non_linear_filter3x3.cl"},
+ {"non_linear_filter_disk3x3", "non_linear_filter3x3.cl"},
+ {"non_linear_filter_box5x5", "non_linear_filter5x5.cl"},
+ {"non_linear_filter_cross5x5", "non_linear_filter5x5.cl"},
+ {"non_linear_filter_disk5x5", "non_linear_filter5x5.cl"},
+ {"non_max_suppression", "nonmax.cl"},
+ {"normalization_layer_cross_map", "normalization_layer.cl"},
+ {"normalization_layer_in_map", "normalization_layer.cl"},
+ {"NV12_to_IYUV_bt709", "color_convert.cl"},
+ {"NV12_to_RGB888_bt709", "color_convert.cl"},
+ {"NV12_to_RGBA8888_bt709", "color_convert.cl"},
+ {"NV12_to_YUV444_bt709", "color_convert.cl"},
+ {"NV21_to_IYUV_bt709", "color_convert.cl"},
+ {"NV21_to_RGB888_bt709", "color_convert.cl"},
+ {"NV21_to_RGBA8888_bt709", "color_convert.cl"},
+ {"NV21_to_YUV444_bt709", "color_convert.cl"},
+ {"output_stage_quantized", "direct_convolution_1x1_3x3_5x5_quantized.cl"},
+ {"permute_201", "permute.cl"},
+ {"permute_120", "permute.cl"},
+ {"permute_3201", "permute.cl"},
+ {"pixelwise_mul_float", "pixelwise_mul_float.cl"},
+ {"pixelwise_mul_int", "pixelwise_mul_int.cl"},
+ {"pixelwise_mul_qasymm8", "pixelwise_mul_quantized.cl"},
+ {"pixelwise_div_float", "pixelwise_div_float.cl"},
+ {"pixelwise_div_int", "pixelwise_div_int.cl"},
+ {"pooling_layer_2", "pooling_layer.cl"},
+ {"pooling_layer_3", "pooling_layer.cl"},
+ {"pooling_layer_optimized_3", "pooling_layer.cl"},
+ {"pooling_layer_7", "pooling_layer.cl"},
+ {"pooling_layer_MxN_nchw", "pooling_layer.cl"},
+ {"pooling_layer_MxN_nhwc", "pooling_layer.cl"},
+ {"pooling_layer_MxN_quantized_nhwc", "pooling_layer_quantized.cl"},
+ {"pooling_layer_MxN_quantized_nchw", "pooling_layer_quantized.cl"},
+ {"quantization_layer", "quantization_layer.cl"},
+ {"reduce_max", "reduce_max.cl"},
+ {"reduction_operation", "reduction_operation.cl"},
+ {"reduction_mean", "reduction_mean.cl"},
+ {"remap_nearest_neighbour", "remap.cl"},
+ {"remap_bilinear", "remap.cl"},
+ {"reshape_layer", "reshape_layer.cl"},
+ {"reshape_to_columns", "convolution_layer.cl"},
+ {"RGB888_to_IYUV_bt709", "color_convert.cl"},
+ {"RGB888_to_NV12_bt709", "color_convert.cl"},
+ {"RGB888_to_RGBA8888_bt709", "color_convert.cl"},
+ {"RGB888_to_YUV444_bt709", "color_convert.cl"},
+ {"RGBA8888_to_IYUV_bt709", "color_convert.cl"},
+ {"RGBA8888_to_NV12_bt709", "color_convert.cl"},
+ {"RGBA8888_to_RGB888_bt709", "color_convert.cl"},
+ {"RGBA8888_to_YUV444_bt709", "color_convert.cl"},
+ {"roi_pooling_layer", "roi_pooling_layer.cl"},
+ {"scale_nearest_neighbour", "scale.cl"},
+ {"scale_bilinear", "scale.cl"},
+ {"scharr3x3", "scharr_filter.cl"},
+ {"sobel3x3", "sobel_filter.cl"},
+ {"sobel_separable5x1", "sobel_filter.cl"},
+ {"sobel_separable1x5", "sobel_filter.cl"},
+ {"sobel_separable7x1", "sobel_filter.cl"},
+ {"sobel_separable1x7", "sobel_filter.cl"},
+ {"softmax_layer_norm", "softmax_layer.cl"},
+ {"softmax_layer_norm_quantized", "softmax_layer_quantized.cl"},
+ {"softmax_layer_max_shift_exp_sum_quantized_serial", "softmax_layer_quantized.cl"},
+ {"softmax_layer_max_shift_exp_sum_quantized_parallel", "softmax_layer_quantized.cl"},
+ {"softmax_layer_max_shift_exp_sum_serial", "softmax_layer.cl"},
+ {"softmax_layer_max_shift_exp_sum_parallel", "softmax_layer.cl"},
+ {"strided_slice", "strided_slice.cl"},
+ {"suppress_non_maximum", "canny.cl"},
+ {"tablelookup_U8", "tablelookup.cl"},
+ {"tablelookup_S16", "tablelookup.cl"},
+ {"threshold_binary", "threshold.cl"},
+ {"threshold_range", "threshold.cl"},
+ {"transpose", "transpose.cl"},
+ {"UYVY422_to_IYUV_bt709", "color_convert.cl"},
+ {"UYVY422_to_NV12_bt709", "color_convert.cl"},
+ {"UYVY422_to_RGB888_bt709", "color_convert.cl"},
+ {"UYVY422_to_RGBA8888_bt709", "color_convert.cl"},
+ {"warp_affine_nearest_neighbour", "warp_affine.cl"},
+ {"warp_affine_bilinear", "warp_affine.cl"},
+ {"warp_perspective_nearest_neighbour", "warp_perspective.cl"},
+ {"warp_perspective_bilinear", "warp_perspective.cl"},
+ {"winograd_filter_transform_2x2_3x3_nchw", "winograd.cl"},
+ {"winograd_filter_transform_4x4_3x3_nchw", "winograd.cl"},
+ {"winograd_filter_transform_4x4_5x5_nchw", "winograd.cl"},
+ {"winograd_input_transform_4x4_5x5_stepz1_nchw", "winograd.cl"},
+ {"winograd_input_transform_2x2_3x3_stepz1_nchw", "winograd.cl"},
+ {"winograd_input_transform_2x2_3x3_stepz2_nchw", "winograd.cl"},
+ {"winograd_input_transform_4x4_3x3_stepz1_nchw", "winograd.cl"},
+ {"winograd_output_transform_2x2_3x3_nchw", "winograd.cl"},
+ {"winograd_output_transform_4x4_3x3_nchw", "winograd.cl"},
+ {"winograd_output_transform_4x4_5x5_nchw", "winograd.cl"},
+ {"YUYV422_to_IYUV_bt709", "color_convert.cl"},
+ {"YUYV422_to_NV12_bt709", "color_convert.cl"},
+ {"YUYV422_to_RGB888_bt709", "color_convert.cl"},
+ {"YUYV422_to_RGBA8888_bt709", "color_convert.cl"},
+ {"topkv2_init", "topkv2.cl"},
+ {"topkv2_find_first_negative", "topkv2.cl"},
+ {"topkv2_reorder_negatives", "topkv2.cl"},
+ {"topkv2_store", "topkv2.cl"},
+ {"radixsort_histogram", "topkv2_radixsort.cl"},
+ {"radixsort_scanhistograms", "topkv2_radixsort.cl"},
+ {"radixsort_pastehistograms", "topkv2_radixsort.cl"},
+ {"radixsort_reorder", "topkv2_radixsort.cl"},
+ {"topkv2_quicksort", "topkv2_quicksort.cl"},
+};
+
+const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map = {
+#ifdef EMBEDDED_KERNELS
+ {
+ "cast.cl",
+#include "./cl_kernels/cast.clembed"
+ },
+ {
+ "fixed_point.h",
+#include "./cl_kernels/fixed_point.hembed"
+ },
+ {
+ "gather.cl",
+#include "./cl_kernels/gather.clembed"
+ },
+ {
+ "helpers.h",
+#include "./cl_kernels/helpers.hembed"
+ },
+ {
+ "helpers_asymm.h",
+#include "./cl_kernels/helpers_asymm.hembed"
+ },
+ {
+ "pixelwise_div_float.cl",
+#include "./cl_kernels/pixelwise_div_float.clembed"
+ },
+ {
+ "pixelwise_div_int.cl",
+#include "./cl_kernels/pixelwise_div_int.clembed"
+ },
+ {
+ "reduce_max.cl",
+#include "./cl_kernels/reduce_max.clembed"
+ },
+ {
+ "reduction_mean.cl",
+#include "./cl_kernels/reduction_mean.clembed"
+ },
+ {
+ "strided_slice.cl",
+#include "./cl_kernels/strided_slice.clembed"
+ },
+ {
+ "topkv2.cl",
+#include "./cl_kernels/topkv2.clembed"
+ },
+ {
+ "topkv2_radixsort.cl",
+#include "./cl_kernels/topkv2_radixsort.clembed"
+ },
+ {
+ "topkv2_quicksort.cl",
+#include "./cl_kernels/topkv2_quicksort.clembed"
+ },
+#endif /* EMBEDDED_KERNELS */
+};
+
+CLKernelLibraryEx::CLKernelLibraryEx()
+ : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map()
+{
+ opencl_is_available(); // Make sure the OpenCL symbols are initialised *before* the
+ // CLKernelLibrary is built
+}
+
+CLKernelLibraryEx &CLKernelLibraryEx::get()
+{
+ static CLKernelLibraryEx _kernel_library;
+ return _kernel_library;
+}
+
+Kernel CLKernelLibraryEx::create_kernel(const std::string &kernel_name,
+ const StringSet &build_options_set) const
+{
+ // Find which program contains the kernel
+ auto kernel_program_it = _kernel_program_map.find(kernel_name);
+
+ if (_kernel_program_map.end() == kernel_program_it)
+ {
+ ARM_COMPUTE_ERROR("Kernel %s not found in the CLKernelLibrary", kernel_name.c_str());
+ }
+ std::string concat_str;
+
+ if (fp16_supported(_device))
+ {
+ concat_str += " -DARM_COMPUTE_OPENCL_FP16_ENABLED=1 ";
+ }
+
+ if (get_cl_version(_device) == CLVersion::CL20)
+ {
+ concat_str += " -cl-std=CL2.0 ";
+ }
+ else if (arm_non_uniform_workgroup_supported(_device))
+ {
+ concat_str += " -cl-arm-non-uniform-work-group-size ";
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Non uniform workgroup size is not supported!!");
+ }
+
+ // Check if the program has been built before with same build options.
+ const std::string program_name = kernel_program_it->second;
+ const std::string build_options = stringify_set(build_options_set) + concat_str;
+
+ const std::string built_program_name = program_name + "_" + build_options;
+ auto built_program_it = _built_programs_map.find(built_program_name);
+
+ cl::Program cl_program;
+
+ if (_built_programs_map.end() != built_program_it)
+ {
+ // If program has been built, retrieve to create kernel from it
+ cl_program = built_program_it->second;
+ }
+ else
+ {
+ // Get program
+ Program program = load_program(program_name);
+
+ // Build program
+ cl_program = program.build(build_options);
+
+ // Add built program to internal map
+ _built_programs_map.emplace(built_program_name, cl_program);
+ }
+
+ // Create and return kernel
+ return Kernel(kernel_name, cl_program);
+}
+
+void CLKernelLibraryEx::add_built_program(const std::string &built_program_name,
+ cl::Program program)
+{
+ _built_programs_map.emplace(built_program_name, program);
+}
+
+const Program &CLKernelLibraryEx::load_program(const std::string &program_name) const
+{
+ const auto program_it = _programs_map.find(program_name);
+
+ if (program_it != _programs_map.end())
+ {
+ return program_it->second;
+ }
+
+ Program program;
+
+#ifdef EMBEDDED_KERNELS
+ const auto program_source_it = _program_source_map.find(program_name);
+
+ if (_program_source_map.end() == program_source_it)
+ {
+ ARM_COMPUTE_ERROR("Embedded program for %s does not exist.", program_name.c_str());
+ }
+
+ program = Program(_context, program_name, program_source_it->second);
+#else /* EMBEDDED_KERNELS */
+ // Check for binary
+ std::string source_name = _kernel_path + program_name;
+ std::string binary_name = source_name + "bin";
+
+ if (std::ifstream(binary_name).is_open())
+ {
+ const std::string program_binary = read_file(binary_name, true);
+ program = Program(_context, _device, program_name,
+ std::vector<unsigned char>(program_binary.begin(), program_binary.end()));
+ }
+ else if (std::ifstream(source_name).is_open())
+ {
+ program = Program(_context, program_name, read_file(source_name, false));
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Kernel file %s does not exist.", source_name.c_str());
+ }
+#endif /* EMBEDDED_KERNELS */
+
+ // Insert program to program map
+ const auto new_program = _programs_map.emplace(program_name, std::move(program));
+
+ return new_program.first->second;
+}
+
+std::string CLKernelLibraryEx::stringify_set(const StringSet &s) const
+{
+ std::string concat_set;
+
+#ifndef EMBEDDED_KERNELS
+ concat_set += "-I" + _kernel_path + " ";
+#endif /* EMBEDDED_KERNELS */
+
+ // Concatenate set
+ for (const auto &el : s)
+ {
+ concat_set += " " + el;
+ }
+
+ return concat_set;
+}
+
+std::string CLKernelLibraryEx::get_program_source(const std::string &program_name)
+{
+ const auto program_source_it = _program_source_map.find(program_name);
+
+ if (program_source_it == _program_source_map.end())
+ {
+ ARM_COMPUTE_ERROR("Embedded program for %s does not exist.", program_name.c_str());
+ }
+
+ return program_source_it->second;
+}
+
+size_t CLKernelLibraryEx::max_local_workgroup_size(const cl::Kernel &kernel) const
+{
+ size_t result;
+
+ size_t err = kernel.getWorkGroupInfo(_device, CL_KERNEL_WORK_GROUP_SIZE, &result);
+ ARM_COMPUTE_ERROR_ON_MSG(
+ err != 0,
+ "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel");
+ ARM_COMPUTE_UNUSED(err);
+
+ return result;
+}
+
+cl::NDRange CLKernelLibraryEx::default_ndrange() const
+{
+ cl::Device device = cl::Device::getDefault();
+ GPUTarget _target = get_target_from_device(device);
+ cl::NDRange default_range;
+
+ switch (_target)
+ {
+ case GPUTarget::MIDGARD:
+ case GPUTarget::T600:
+ case GPUTarget::T700:
+ case GPUTarget::T800:
+ default_range = cl::NDRange(128u, 1);
+ break;
+ default:
+ default_range = cl::NullRange;
+ }
+
+ return default_range;
+}
+
+std::string CLKernelLibraryEx::get_device_version() { return _device.getInfo<CL_DEVICE_VERSION>(); }
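A sketch of how a kernel from this extended library might be built on the host (not part of the patch; it assumes the singleton has already been given a valid CL context and device, mirroring CLKernelLibrary, and that StringSet is an alias for std::set<std::string>):

    #include <set>
    #include <string>

    #include "arm_compute/core/CL/CLKernelLibraryEx.h"

    using namespace arm_compute;

    Kernel build_cast_kernel()
    {
        // Build options documented in cast.cl; the concrete types and vector size are illustrative.
        std::set<std::string> build_opts = {"-DDATA_TYPE_IN=uchar", "-DDATA_TYPE_OUT=float", "-DVEC_SIZE=16"};
        return CLKernelLibraryEx::get().create_kernel("cast", build_opts);
    }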
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
new file mode 100644
index 000000000..0c0a9ede6
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers_asymm.h"
+
+#if defined(FIXED_POINT_POSITION)
+#include "fixed_point.h"
+#endif /* FIXED_POINT_POSITION */
+
+#ifdef SATURATE
+#define ADD(x, y) add_sat((x), (y))
+#define SUB(x, y) sub_sat((x), (y))
+#else /* SATURATE */
+#define ADD(x, y) (x) + (y)
+#define SUB(x, y) (x) - (y)
+#endif /* SATURATE */
+
+/** Performs a pixelwise addition of two QASYMM8 tensors, producing a QASYMM8 output.
+ *
+ * The following computations will be performed:
+ *
+ * -# Add offset terms to inputs
+ * -# Get scaled value of two inputs
+ * -# Add inputs
+ * -# Multiply each entry of the result by result_mult_int and shift by result_shift
+ * -# Add offset term to the final result
+ * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
+ *
+ * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
+ * @attention The number of bits to shift left of input tensors must be passed at compile time using -DLEFT_SHIFT
+ * @attention The offset, scalar scale factor and number of bits to shift right of input tensors must be passed at compile time using -DIN1_OFFSET, -DIN1_MULT_INT, -DIN1_SHIFT, -DIN2_OFFSET, -DIN2_MULT_INT and -DIN2_SHIFT
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULT_INT and -DRESULT_SHIFT
+ *
+ * @attention The inputs and output scale information of qasymm8 need to be passed at compile time using -DSCALE_IN1, -DSCALE_IN2 and -DSCALE_OUT:
+ * e.g. -DSCALE_IN1=1.f -DSCALE_IN2=1.f -DSCALE_OUT=2.f
+ * @attention The inputs and output scale offset need to be passed at compile time using -DOFFSET_IN1, -DOFFSET_IN2 and -DOFFSET_OUT:
+ * e.g. -DOFFSET_IN1=0 -DOFFSET_IN2=0 -DOFFSET_OUT=0
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
+ *
+ * @param[in] in1_ptr Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] in2_ptr Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] out_ptr Pointer to the destination tensor. Supported data types: QASYMM8
+ * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void arithmetic_add_qasymm8(
+ TENSOR3D_DECLARATION(in1),
+ TENSOR3D_DECLARATION(in2),
+ TENSOR3D_DECLARATION(out))
+{
+ // Get pixels pointer
+ Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+ Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+ // Load data
+ VEC_DATA_TYPE(int, 16)
+ in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16));
+ VEC_DATA_TYPE(int, 16)
+ in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16));
+
+ // Get scaled value of two inputs
+ VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET);
+ VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET);
+
+ VEC_DATA_TYPE(int, 16) left_shift = (VEC_DATA_TYPE(int, 16))1 << (VEC_DATA_TYPE(int, 16))(LEFT_SHIFT);
+ VEC_DATA_TYPE(int, 16) shifted_in1_val = in1_val * left_shift;
+ VEC_DATA_TYPE(int, 16) shifted_in2_val = in2_val * left_shift;
+
+ VEC_DATA_TYPE(int, 16) scaled_in1_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in1_val, IN1_MULT_INT, IN1_SHIFT, 16);
+ VEC_DATA_TYPE(int, 16) scaled_in2_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in2_val, IN2_MULT_INT, IN2_SHIFT, 16);
+
+ // Add inputs and multiply with a multiplier smaller than 1
+ VEC_DATA_TYPE(int, 16) sum_val = scaled_in1_val + scaled_in2_val;
+ VEC_DATA_TYPE(int, 16) out_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(sum_val, RESULT_MULT_INT, RESULT_SHIFT, 16);
+ out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET);
+
+ VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16));
+
+// TODO: Apply min/max bounds to support fusion with ReLU.
+/*
+#if defined(MIN_BOUND)
+ res = max(res, (uchar16)MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ res = min(res, (uchar16)MAX_BOUND);
+#endif // defined(MAX_BOUND)
+*/
+
+ // Store result
+ VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)),
+ 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
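To make the requantisation steps above concrete, here is a scalar C++ sketch of the per-element arithmetic (not part of the patch; the rounding helper mirrors the usual gemmlowp-style ASYMM_* macros from helpers_asymm.h, the saturating corner case of the doubling high multiply is omitted for brevity, and all parameter values would normally come from the -D build options listed in the kernel documentation):

    #include <cstdint>

    // Rounding-doubling high multiply followed by a rounding right shift, i.e. a scalar
    // stand-in for ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE (overflow handling omitted).
    static int32_t mult_by_quant_multiplier(int32_t x, int32_t multiplier, int shift)
    {
        const int64_t prod      = static_cast<int64_t>(x) * multiplier;
        const int64_t nudge     = prod >= 0 ? (1ll << 30) : (1 - (1ll << 30));
        const int32_t high      = static_cast<int32_t>((prod + nudge) >> 31);
        const int32_t mask      = (1 << shift) - 1;
        const int32_t remainder = high & mask;
        const int32_t threshold = (mask >> 1) + (high < 0 ? 1 : 0);
        return (high >> shift) + (remainder > threshold ? 1 : 0);
    }

    // Per-element equivalent of arithmetic_add_qasymm8.
    uint8_t qasymm8_add(uint8_t in1, uint8_t in2,
                        int in1_offset, int32_t in1_mult, int in1_shift,
                        int in2_offset, int32_t in2_mult, int in2_shift,
                        int result_offset, int32_t result_mult, int result_shift,
                        int left_shift)
    {
        const int32_t in1_val = (in1 + in1_offset) * (1 << left_shift);
        const int32_t in2_val = (in2 + in2_offset) * (1 << left_shift);

        const int32_t scaled1 = mult_by_quant_multiplier(in1_val, in1_mult, in1_shift);
        const int32_t scaled2 = mult_by_quant_multiplier(in2_val, in2_mult, in2_shift);

        int32_t out = mult_by_quant_multiplier(scaled1 + scaled2, result_mult, result_shift) + result_offset;
        if (out < 0)   out = 0;   // clamp to the QASYMM8 range (the CL kernel currently leaves
        if (out > 255) out = 255; // the min/max bound application as a TODO)
        return static_cast<uint8_t>(out);
    }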
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
new file mode 100644
index 000000000..113804cca
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#ifndef SCALE_IN
+#define SCALE_IN 1.0f
+#endif
+#ifndef OFFSET_IN
+#define OFFSET_IN 0
+#endif
+
+/** Perform a cast operation on an input tensor.
+ *
+ * @attention Input and output data types can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flags, e.g. -DDATA_TYPE_IN=float -DDATA_TYPE_OUT=int
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void cast(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr),
+ VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),
+ 0, (__global DATA_TYPE_OUT *)output.ptr);
+}
+
+
+/** Perform a cast operation on a QASYMM8 input tensor.
+ *
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: F16/F32
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void cast_qasymm_in(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_data =
+ VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
+ VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET_IN);
+ VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE_IN);
+
+ VEC_DATA_TYPE(int, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(int, VEC_SIZE)) - offset;
+ VEC_DATA_TYPE(float, VEC_SIZE) out_data = CONVERT(tmp, VEC_DATA_TYPE(float, VEC_SIZE)) * scale;
+
+ VSTORE(VEC_SIZE)(CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),
+ 0, (__global DATA_TYPE_OUT *)output.ptr);
+}
+
+
+/** Perform a cast operation on a QASYMM8 output tensor.
+ *
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void cast_qasymm_out(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_data =
+ VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
+ VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET_IN);
+ VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE_IN);
+
+ VEC_DATA_TYPE(float, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(float, VEC_SIZE)) / scale;
+ VEC_DATA_TYPE(float, VEC_SIZE) out_data = tmp + CONVERT(offset, VEC_DATA_TYPE(float, VEC_SIZE));
+
+ VSTORE(VEC_SIZE)(CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),
+ 0, (__global DATA_TYPE_OUT *)output.ptr);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/fixed_point.h b/libs/ARMComputeEx/src/core/CL/cl_kernels/fixed_point.h
new file mode 100644
index 000000000..7807533e2
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/fixed_point.h
@@ -0,0 +1,565 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_FIXED_POINT_H
+#define ARM_COMPUTE_FIXED_POINT_H
+
+#define TYPE_ALIAS(type, alias) \
+ typedef type alias; \
+ typedef type alias##x##1; \
+ typedef type##2 alias##x##2; \
+ typedef type##3 alias##x##3; \
+ typedef type##4 alias##x##4; \
+ typedef type##8 alias##x##8; \
+ typedef type##16 alias##x##16;
+
+TYPE_ALIAS(char, qs8)
+TYPE_ALIAS(short, qs16)
+TYPE_ALIAS(int, qs32)
+
+#define qs8_MIN ((char)CHAR_MIN)
+#define qs8_MAX ((char)CHAR_MAX)
+#define qs16_MIN ((short)SHRT_MIN)
+#define qs16_MAX ((short)SHRT_MAX)
+#define qs32_MIN ((int)INT_MIN)
+#define qs32_MAX ((int)INT_MAX)
+
+#define qu8_MIN ((uchar)0)
+#define qu8_MAX ((uchar)UCHAR_MAX)
+#define qu16_MIN ((ushort)0)
+#define qu16_MAX ((ushort)USHRT_MAX)
+#define qu32_MIN ((uint)0)
+#define qu32_MAX ((uint)UINT_MAX)
+
+#define qs8_TYPE char
+#define qs8x1_TYPE char
+#define qs8x2_TYPE char2
+#define qs8x3_TYPE char3
+#define qs8x4_TYPE char4
+#define qs8x8_TYPE char8
+#define qs8x16_TYPE char16
+
+#define qs16_TYPE short
+#define qs16x1_TYPE short
+#define qs16x2_TYPE short2
+#define qs16x3_TYPE short3
+#define qs16x4_TYPE short4
+#define qs16x8_TYPE short8
+#define qs16x16_TYPE short16
+
+#define qs32_TYPE int
+#define qs32x1_TYPE int
+#define qs32x2_TYPE int2
+#define qs32x3_TYPE int3
+#define qs32x4_TYPE int4
+#define qs32x8_TYPE int8
+#define qs32x16_TYPE int16
+
+/* All internal constants are represented in the maximum supported fixed point format (QS16),
+ * thus we define an additional shift parameter required to convert the constant
+ * from the maximum supported format to the required one.
+ */
+#define qs8_SHIFT 8
+#define qs16_SHIFT 0
+
+#undef VEC_DATA_TYPE_STR
+#undef VEC_DATA_TYPE
+#undef CONVERT_STR
+#undef CONVERT
+#undef CONVERT_SAT_STR
+#undef CONVERT_SAT
+
+#define VEC_DATA_TYPE_STR(type, size) type##x##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR3(x, type, rtype) (convert_##rtype((x)))
+#define CONVERT_STR2(x, type, rtype) CONVERT_STR3(x, type, rtype)
+#define CONVERT_STR(x, type) CONVERT_STR2(x, type, type##_TYPE)
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR3(x, type, rtype) (convert_##rtype##_sat((x)))
+#define CONVERT_SAT_STR2(x, type, rtype) CONVERT_SAT_STR3(x, type, rtype)
+#define CONVERT_SAT_STR(x, type) CONVERT_SAT_STR2(x, type, type##_TYPE)
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+/** Computes saturating absolute value of fixed point vector.
+ *
+ * @param[in] type the actual data type.
+ *
+ * @return The result of the fixed point absolute value.
+ */
+#define ABSQ_SAT_IMPL(type) \
+ inline type abs_##type##_sat(type VopA) { return CONVERT_SAT(abs(VopA), type); }
+
+ABSQ_SAT_IMPL(qs8x16)
+ABSQ_SAT_IMPL(qs16x8)
+
+#define ABS_SAT_OP_EXPAND_STR(a, type, size) abs_##type##x##size##_sat((a))
+#define ABS_SAT_OP_EXPAND(a, type, size) ABS_SAT_OP_EXPAND_STR(a, type, size)
+
+/** Computes max of fixed point types.
+ *
+ * @param[in] type the actual data type.
+ *
+ * @return The result of the fixed point maximum.
+ */
+#define MAXQ_IMPL(type) \
+ inline type max_##type(type VopA, type VopB) { return max(VopA, VopB); }
+
+MAXQ_IMPL(qs8x1)
+MAXQ_IMPL(qs8x2)
+MAXQ_IMPL(qs8x4)
+MAXQ_IMPL(qs8x8)
+MAXQ_IMPL(qs8x16)
+MAXQ_IMPL(qs16x1)
+MAXQ_IMPL(qs16x2)
+MAXQ_IMPL(qs16x4)
+MAXQ_IMPL(qs16x8)
+MAXQ_IMPL(qs16x16)
+
+#define MAX_OP_EXPAND_STR(a, b, type, size) max_##type##x##size((a), (b))
+#define MAX_OP_EXPAND(a, b, type, size) MAX_OP_EXPAND_STR(a, b, type, size)
+
+/** Computes saturated addition of fixed point types.
+ *
+ * @param[in] type the actual data type.
+ *
+ * @return The result of the fixed point addition. The result is saturated in case of overflow
+ */
+#define ADDQ_SAT_IMPL(type) \
+ inline type add_sat_##type(type VopA, type VopB) { return add_sat(VopA, VopB); }
+
+ADDQ_SAT_IMPL(qs8x1)
+ADDQ_SAT_IMPL(qs8x2)
+ADDQ_SAT_IMPL(qs8x4)
+ADDQ_SAT_IMPL(qs8x8)
+ADDQ_SAT_IMPL(qs8x16)
+ADDQ_SAT_IMPL(qs16x1)
+ADDQ_SAT_IMPL(qs16x2)
+ADDQ_SAT_IMPL(qs16x4)
+ADDQ_SAT_IMPL(qs16x8)
+ADDQ_SAT_IMPL(qs16x16)
+ADDQ_SAT_IMPL(qs32x1)
+ADDQ_SAT_IMPL(qs32x2)
+ADDQ_SAT_IMPL(qs32x4)
+ADDQ_SAT_IMPL(qs32x8)
+ADDQ_SAT_IMPL(qs32x16)
+
+#define ADD_SAT_OP_EXPAND_STR(a, b, type, size) add_sat_##type##x##size((a), (b))
+#define ADD_SAT_OP_EXPAND(a, b, type, size) ADD_SAT_OP_EXPAND_STR(a, b, type, size)
+
+/** Computes saturated subtraction of fixed point types.
+ *
+ * @param[in] type the actual data type.
+ *
+ * @return The result of the fixed point subtraction. The result is saturated in case of overflow
+ */
+#define SUBQ_SAT_IMPL(type) \
+ inline type sub_sat_##type(type VopA, type VopB) { return sub_sat(VopA, VopB); }
+
+SUBQ_SAT_IMPL(qs8x1)
+SUBQ_SAT_IMPL(qs8x2)
+SUBQ_SAT_IMPL(qs8x4)
+SUBQ_SAT_IMPL(qs8x8)
+SUBQ_SAT_IMPL(qs8x16)
+SUBQ_SAT_IMPL(qs16x1)
+SUBQ_SAT_IMPL(qs16x2)
+SUBQ_SAT_IMPL(qs16x4)
+SUBQ_SAT_IMPL(qs16x8)
+SUBQ_SAT_IMPL(qs16x16)
+
+#define SUB_SAT_OP_EXPAND_STR(a, b, type, size) sub_sat_##type##x##size((a), (b))
+#define SUB_SAT_OP_EXPAND(a, b, type, size) SUB_SAT_OP_EXPAND_STR(a, b, type, size)
+
+/* Multiply of two fixed point numbers
+ *
+ * @param[in] type the actual data type.
+ * @param[in] itype the intermediate data type.
+ *
+ * @return The result of the fixed point multiplication.
+ */
+#define MULQ_IMPL(type, itype) \
+ inline type mul_##type(type VopA, type VopB, int fixed_point_position) \
+ { \
+ itype round_val = (itype)(1 << (fixed_point_position - 1)); \
+ itype res = CONVERT((VopA), itype) * CONVERT((VopB), itype) + round_val; \
+ return CONVERT((res >> (itype)fixed_point_position), type); \
+ }
+
+MULQ_IMPL(qs8x8, qs16x8)
+MULQ_IMPL(qs16x8, qs32x8)
+MULQ_IMPL(qs8x16, qs16x16)
+MULQ_IMPL(qs16x16, qs32x16)
+
+#define MUL_OP_EXPAND_STR(a, b, type, size, position) mul_##type##x##size((a), (b), (position))
+#define MUL_OP_EXPAND(a, b, type, size, position) MUL_OP_EXPAND_STR(a, b, type, size, position)
+
+/* Saturate multiply of two fixed point numbers
+ *
+ * @param[in] type the actual data type.
+ * @param[in] itype the intermediate data type.
+ *
+ * @return The result of the fixed point multiplication. The result is saturated in case of overflow
+ */
+#define MULQ_SAT_IMPL(type, itype) \
+ inline type mul_sat_##type(type VopA, type VopB, int fixed_point_position) \
+ { \
+ itype round_val = (itype)(1 << (fixed_point_position - 1)); \
+ itype res = mad_sat(CONVERT((VopA), itype), CONVERT((VopB), itype), round_val); \
+ return CONVERT_SAT((res >> (itype)fixed_point_position), type); \
+ }
+
+MULQ_SAT_IMPL(qs8x1, qs16x1)
+MULQ_SAT_IMPL(qs8x2, qs16x2)
+MULQ_SAT_IMPL(qs8x3, qs16x3)
+MULQ_SAT_IMPL(qs8x4, qs16x4)
+MULQ_SAT_IMPL(qs8x8, qs16x8)
+MULQ_SAT_IMPL(qs8x16, qs16x16)
+MULQ_SAT_IMPL(qs16x1, qs32x1)
+MULQ_SAT_IMPL(qs16x2, qs32x2)
+MULQ_SAT_IMPL(qs16x3, qs32x3)
+MULQ_SAT_IMPL(qs16x4, qs32x4)
+MULQ_SAT_IMPL(qs16x8, qs32x8)
+MULQ_SAT_IMPL(qs16x16, qs32x16)
+
+#define MUL_SAT_OP_EXPAND_STR(a, b, type, size, position) \
+ mul_sat_##type##x##size((a), (b), (position))
+#define MUL_SAT_OP_EXPAND(a, b, type, size, position) \
+ MUL_SAT_OP_EXPAND_STR(a, b, type, size, position)
+
+/** Saturate multiply-accumulate
+ *
+ * @param[in] type the actual data type.
+ * @param[in] itype the intermediate data type.
+ *
+ * @return The result of the fixed point multiply-accumulate. The result is saturated in case of
+ * overflow
+ */
+#define MLAQ_SAT_IMPL(type, itype) \
+ type mla_sat_##type(type VopA, type VopB, type VopC, int fixed_point_position) \
+ { \
+ itype res = mad_sat(CONVERT(VopB, itype), CONVERT(VopC, itype), \
+ (itype)(1 << (fixed_point_position - 1))); \
+ return add_sat(VopA, CONVERT_SAT(res >> (itype)fixed_point_position, type)); \
+ }
+
+MLAQ_SAT_IMPL(qs8x8, qs16x8)
+MLAQ_SAT_IMPL(qs8x16, qs16x16)
+MLAQ_SAT_IMPL(qs16x8, qs32x8)
+
+#define MLA_SAT_OP_EXPAND_STR(a, b, c, type, size, position) \
+ mla_sat_##type##x##size((a), (b), (c), (position))
+#define MLA_SAT_OP_EXPAND(a, b, c, type, size, position) \
+ MLA_SAT_OP_EXPAND_STR(a, b, c, type, size, position)
+
+/** Saturate multiply-accumulate long
+ *
+ * @param[in] type the actual data type.
+ * @param[in] itype the intermediate data type.
+ *
+ * @return The result of the fixed point multiply-accumulate long. The result is saturated in case
+ * of overflow
+ */
+#define MLALQ_SAT_IMPL(type, itype) \
+ itype mlal_sat_##type(itype VopA, type VopB, type VopC, int fixed_point_position) \
+ { \
+ itype res = mad_sat(CONVERT(VopB, itype), CONVERT(VopC, itype), \
+ (itype)(1 << (fixed_point_position - 1))); \
+ return add_sat(VopA, res >> (itype)fixed_point_position); \
+ }
+
+MLALQ_SAT_IMPL(qs8x8, qs16x8)
+MLALQ_SAT_IMPL(qs16x8, qs32x8)
+
+#define MLAL_SAT_OP_EXPAND_STR(a, b, c, type, size, position) \
+ mlal_sat_##type##x##size((a), (b), (c), (position))
+#define MLAL_SAT_OP_EXPAND(a, b, c, type, size, position) \
+ MLAL_SAT_OP_EXPAND_STR(a, b, c, type, size, position)
+
+/** Saturate division of two fixed point vectors
+ *
+ * @param[in] stype the actual scalar data type.
+ * @param[in] type the actual data type.
+ * @param[in] itype the intermediate data type.
+ *
+ * @return The result of the fixed point division. The result is saturated in case of overflow
+ */
+#define DIVQ_SAT_IMPL(stype, type, itype) \
+ inline type div_sat_##type(type VopA, type VopB, int fixed_point_position) \
+ { \
+ itype conv_a = CONVERT((VopA), itype); \
+ itype denominator = CONVERT((VopB), itype); \
+ itype numerator = conv_a << (itype)(fixed_point_position); \
+ itype res = select((itype)(numerator / denominator), \
+ select((itype)stype##_MAX, (itype)stype##_MIN, (itype)(conv_a < (itype)0)), \
+ (itype)(denominator == (itype)0)); \
+ return CONVERT_SAT((res), type); \
+ }
+
+DIVQ_SAT_IMPL(qs8, qs8x16, qs16x16)
+DIVQ_SAT_IMPL(qs16, qs16x8, qs32x8)
+DIVQ_SAT_IMPL(qs16, qs16x16, qs32x16)
+DIVQ_SAT_IMPL(qs8, qs8, qs16)
+DIVQ_SAT_IMPL(qs16, qs16, qs32)
+
+#define DIV_SAT_OP_EXPAND_STR(a, b, type, position) div_sat_##type((a), (b), (position))
+#define DIV_SAT_OP_EXPAND(a, b, type, position) DIV_SAT_OP_EXPAND_STR(a, b, type, position)
+
+#define DIV_SAT_OP_VEC_EXPAND_STR(a, b, type, size, position) \
+ div_sat_##type##x##size((a), (b), (position))
+#define DIV_SAT_OP_VEC_EXPAND(a, b, type, size, position) \
+ DIV_SAT_OP_VEC_EXPAND_STR(a, b, type, size, position)
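+
+/* Worked example: with fixed_point_position = 5, 3.0 is stored as 96 and 2.0 as 64; the scalar
+ * div_sat_qs8 evaluates (96 << 5) / 64 = 48, i.e. 48 / 2^5 = 1.5. A zero denominator yields the
+ * saturated maximum (or minimum for a negative numerator) instead of faulting.
+ */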
+
+/** Saturate exponential of a fixed point vector
+ *
+ * @note The implemented approach uses a Taylor polynomial to approximate the exponential function.
+ *
+ * @param[in] stype the actual scalar data type.
+ * @param[in] type the actual data type.
+ * @param[in] size the number of the calculated elements.
+ *
+ * @return The result of the fixed point exponential. The result is saturated in case of overflow
+ */
+#define EXPQ_IMPL(stype, type, size) \
+ inline type exp_sat_##type(type VopA, int fixed_point_position) \
+ { \
+ type const_one = (type)(1 << (fixed_point_position)); \
+ type ln2 = (type)((((0x58B9 >> (14 - fixed_point_position))) + 1) >> 1); \
+ type inv_ln2 = (type)((((0x38AA >> (14 - fixed_point_position)) + 1) >> 1)) | const_one; \
+ type A = (type)(((0x7FBA >> (14 - fixed_point_position)) + 1) >> 1); \
+ type B = (type)(((0x3FE9 >> (14 - fixed_point_position)) + 1) >> 1); \
+ type C = (type)(((0x1693 >> (14 - fixed_point_position)) + 1) >> 1); \
+ type D = (type)(((0x0592 >> (14 - fixed_point_position)) + 1) >> 1); \
+ type m = MUL_SAT_OP_EXPAND(VopA, inv_ln2, stype, size, fixed_point_position); \
+ type dec_m = m >> (type)fixed_point_position; \
+ type alpha = MUL_SAT_OP_EXPAND(dec_m << (type)fixed_point_position, ln2, stype, size, \
+ fixed_point_position); \
+ alpha = CONVERT(abs_diff(VopA, alpha), type); \
+ type sum = add_sat(MUL_SAT_OP_EXPAND(alpha, D, stype, size, fixed_point_position), C); \
+ sum = add_sat(MUL_SAT_OP_EXPAND(alpha, sum, stype, size, fixed_point_position), B); \
+ sum = add_sat(MUL_SAT_OP_EXPAND(alpha, sum, stype, size, fixed_point_position), A); \
+ sum = add_sat(MUL_SAT_OP_EXPAND(alpha, sum, stype, size, fixed_point_position), const_one); \
+ return select((type)stype##_MAX, select(sum << dec_m, sum >> -dec_m, dec_m < (type)0), \
+ clz(sum) > dec_m); /* Saturate result if needed */ \
+ }
+
+EXPQ_IMPL(qs8, qs8x2, 2)
+EXPQ_IMPL(qs8, qs8x4, 4)
+EXPQ_IMPL(qs8, qs8x8, 8)
+EXPQ_IMPL(qs8, qs8x16, 16)
+EXPQ_IMPL(qs16, qs16x2, 2)
+EXPQ_IMPL(qs16, qs16x4, 4)
+EXPQ_IMPL(qs16, qs16x8, 8)
+EXPQ_IMPL(qs16, qs16x16, 16)
+
+#define EXP_OP_EXPAND_STR(a, type, size, position) exp_sat_##type##x##size((a), (position))
+#define EXP_OP_EXPAND(a, type, size, position) EXP_OP_EXPAND_STR(a, type, size, position)
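+
+/* Usage sketch (illustrative only): per-lane exponential of a qs8x16 vector, where src is an
+ * assumed Image obtained via CONVERT_TO_IMAGE_STRUCT and -DFIXED_POINT_POSITION=<n> is passed at
+ * build time:
+ *
+ *     qs8x16 x = vload16(0, (__global qs8 *)src.ptr);
+ *     qs8x16 e = EXP_OP_EXPAND(x, qs8, 16, FIXED_POINT_POSITION);
+ */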
+
+/** Saturate logarithm of a fixed point vector
+ *
+ * @note The implemented approach uses a Taylor polynomial to approximate the logarithm function.
+ *
+ * @param[in] stype the actual scalar data type.
+ * @param[in] type the actual data type.
+ * @param[in] size the number of the calculated elements.
+ *
+ * @return The result of the fixed point logarithm. The result is saturated in case of overflow
+ */
+#define LOGQ_IMPL(stype, type, size) \
+ inline type log_sat_##type(type VopA, int fixed_point_position) \
+ { \
+ type const_one = (type)(1 << (fixed_point_position)); \
+    type ln2 = (type)(0x58B9 >> (15 - fixed_point_position)); /* 0.6931471 */                  \
+ type A = (type)(0x5C0F >> (14 - fixed_point_position)); /* 1.4384189 */ \
+ type B = -(type)(0x56AE >> (15 - fixed_point_position)); /* -0.6771900 */ \
+ type C = (type)(0x2933 >> (15 - fixed_point_position)); /* 0.3218538 */ \
+ type D = -(type)(0x0AA7 >> (15 - fixed_point_position)); /* -0.0832229 */ \
+ type inter_a = \
+ select(VopA, DIV_SAT_OP_VEC_EXPAND(const_one, VopA, stype, size, fixed_point_position), \
+ VopA < const_one); \
+ type shift_val = (type)(15 - stype##_SHIFT) - clz(inter_a >> (type)fixed_point_position); \
+ inter_a = inter_a >> shift_val; \
+ inter_a = sub_sat(inter_a, const_one); \
+ type sum = add_sat(MUL_SAT_OP_EXPAND(inter_a, D, stype, size, fixed_point_position), C); \
+ sum = add_sat(MUL_SAT_OP_EXPAND(inter_a, sum, stype, size, fixed_point_position), B); \
+ sum = add_sat(MUL_SAT_OP_EXPAND(inter_a, sum, stype, size, fixed_point_position), A); \
+ sum = MUL_SAT_OP_EXPAND(inter_a, sum, stype, size, fixed_point_position); \
+ sum = MUL_SAT_OP_EXPAND(add_sat(sum, shift_val << (type)fixed_point_position), ln2, stype, \
+ size, fixed_point_position); \
+ return select(select(sum, -sum, VopA < const_one), (type)0, \
+ VopA < (type)0); /* Saturate result if needed */ \
+ }
+
+LOGQ_IMPL(qs8, qs8x16, 16)
+LOGQ_IMPL(qs16, qs16x8, 8)
+LOGQ_IMPL(qs16, qs16x16, 16)
+
+#define LOG_OP_EXPAND_STR(a, type, size, position) log_sat_##type##x##size((a), (position))
+#define LOG_OP_EXPAND(a, type, size, position) LOG_OP_EXPAND_STR(a, type, size, position)
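+
+/* Usage sketch (illustrative only): per-lane natural logarithm of a qs16x8 vector named data
+ * (assumed to be already loaded), with -DFIXED_POINT_POSITION=<n> passed at build time:
+ *
+ *     qs16x8 l = LOG_OP_EXPAND(data, qs16, 8, FIXED_POINT_POSITION);
+ *
+ * Inputs below 1.0 go through the reciprocal first, so log(x) = -log(1/x); negative inputs return 0.
+ */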
+
+/** Saturate inverse square root of a fixed point vector
+ *
+ * @note The implemented approach uses Newton's method to approximate the inverse square root function.
+ *
+ * @param[in] stype the actual scalar data type.
+ * @param[in] type the actual data type.
+ * @param[in] size the number of the calculated elements.
+ *
+ * @return The result of the fixed point inverse square root. The result is saturated in case of
+ * overflow
+ */
+#define INVSQRTQ_IMPL(stype, type, size) \
+ inline type invsqrt_sat_##type(type VopA, int fixed_point_position) \
+ { \
+ type const_three = (type)(3 << (fixed_point_position)); \
+ type shift_value = (type)(16 - stype##_SHIFT) - (clz(VopA) + (type)fixed_point_position); \
+ type temp = select((type)(VopA >> shift_value), \
+ select((type)stype##_MAX, (type)(VopA << (-shift_value)), \
+ (type)(clz(VopA) > (-shift_value))), \
+ (type)(shift_value < (type)0)); \
+ type x = temp; \
+ x = MUL_SAT_OP_EXPAND( \
+ x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \
+ fixed_point_position), \
+ temp, stype, size, fixed_point_position)), \
+ stype, size, fixed_point_position) >> \
+ 1; \
+ x = MUL_SAT_OP_EXPAND( \
+ x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \
+ fixed_point_position), \
+ temp, stype, size, fixed_point_position)), \
+ stype, size, fixed_point_position) >> \
+ 1; \
+ x = MUL_SAT_OP_EXPAND( \
+ x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \
+ fixed_point_position), \
+ temp, stype, size, fixed_point_position)), \
+ stype, size, fixed_point_position) >> \
+ 1; \
+ if (sizeof((stype)(1)) > 1) /* Perform more iterations if datatype is QS16 */ \
+ { \
+ x = MUL_SAT_OP_EXPAND( \
+ x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \
+ fixed_point_position), \
+ temp, stype, size, fixed_point_position)), \
+ stype, size, fixed_point_position) >> \
+ 1; \
+ x = MUL_SAT_OP_EXPAND( \
+ x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \
+ fixed_point_position), \
+ temp, stype, size, fixed_point_position)), \
+ stype, size, fixed_point_position) >> \
+ 1; \
+ } \
+ type shift_value2 = select(shift_value >> 1, (-shift_value) >> 1, shift_value < (type)0); \
+ return select((type)(x >> shift_value2), select((type)stype##_MAX, (type)(x << shift_value2), \
+ (type)(clz(x) > shift_value2)), \
+ (type)(shift_value < (type)0)); /* Saturate result if needed */ \
+ }
+
+INVSQRTQ_IMPL(qs8, qs8x1, 1)
+INVSQRTQ_IMPL(qs16, qs16x1, 1)
+INVSQRTQ_IMPL(qs8, qs8x16, 16)
+INVSQRTQ_IMPL(qs16, qs16x8, 8)
+
+#define INVSQRT_OP_EXPAND_STR(a, type, size, position) invsqrt_sat_##type##x##size((a), (position))
+#define INVSQRT_OP_EXPAND(a, type, size, position) INVSQRT_OP_EXPAND_STR(a, type, size, position)
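+
+/* Usage sketch (illustrative only): per-lane 1/sqrt(x) of a qs16x8 vector of squared sums named
+ * sum_sq (assumed), e.g. in a normalization kernel built with -DFIXED_POINT_POSITION=<n>:
+ *
+ *     qs16x8 inv_norm = INVSQRT_OP_EXPAND(sum_sq, qs16, 8, FIXED_POINT_POSITION);
+ */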
+
+/** Saturate hyperbolic tangent of a fixed point vector
+ *
+ * tanh(x) = (e^2x - 1)/(e^2x + 1)
+ *
+ * @param[in] stype the actual scalar data type.
+ * @param[in] type the actual data type.
+ * @param[in] size the number of the calculated elements.
+ *
+ * @return The result of the fixed point hyperbolic tangent. The result is saturated in case of
+ * overflow
+ */
+#define TANHQ_IMPL(stype, type, size) \
+ inline type tanh_sat_##type(type VopA, int fixed_point_position) \
+ { \
+ type const_one = (type)(1 << (fixed_point_position)); \
+ type const_two = (type)(2 << (fixed_point_position)); \
+ type exp2x = \
+ EXP_OP_EXPAND(MUL_SAT_OP_EXPAND(const_two, VopA, stype, size, fixed_point_position), \
+ stype, size, fixed_point_position); \
+ type num = SUB_SAT_OP_EXPAND(exp2x, const_one, stype, size); \
+ type den = ADD_SAT_OP_EXPAND(exp2x, const_one, stype, size); \
+ return DIV_SAT_OP_VEC_EXPAND(num, den, stype, size, fixed_point_position); \
+ }
+
+TANHQ_IMPL(qs8, qs8x16, 16)
+TANHQ_IMPL(qs16, qs16x8, 8)
+
+#define TANH_OP_EXPAND_STR(a, type, size, position) tanh_sat_##type##x##size((a), (position))
+#define TANH_OP_EXPAND(a, type, size, position) TANH_OP_EXPAND_STR(a, type, size, position)
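+
+/* Usage sketch (illustrative only): per-lane hyperbolic tangent of a qs16x8 vector named data
+ * (assumed), e.g. as an activation built with -DFIXED_POINT_POSITION=<n>:
+ *
+ *     qs16x8 activated = TANH_OP_EXPAND(data, qs16, 8, FIXED_POINT_POSITION);
+ */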
+
+#define floatx16 float16
+#define float16_TYPE float16
+
+#define CONVERTQ_DOWN_IMPL(in_type, out_type) \
+ inline out_type convert_##out_type##_##in_type(in_type a, int fixed_point_position) \
+ { \
+ return CONVERT(a * (1 << fixed_point_position) + \
+ select((in_type)-0.5f, (in_type)0.5f, isgreater(a, (in_type)0)), \
+ out_type); \
+ }
+
+CONVERTQ_DOWN_IMPL(float16, qs8x16)
+CONVERTQ_DOWN_IMPL(float16, qs16x16)
+
+#define CONVERTQ_DOWN_SAT_IMPL(in_type, out_type) \
+ inline out_type convert_##out_type##_##in_type##_sat(in_type a, int fixed_point_position) \
+ { \
+ return CONVERT_SAT(a * (1 << fixed_point_position) + \
+ select((in_type)-0.5f, (in_type)0.5f, isgreater(a, (in_type)0)), \
+ out_type); \
+ }
+
+CONVERTQ_DOWN_SAT_IMPL(float16, qs8x16)
+CONVERTQ_DOWN_SAT_IMPL(float16, qs16x16)
+
+#define CONVERTQ_UP_IMPL(in_type, out_type) \
+ inline out_type convert_##out_type##_##in_type(in_type a, int fixed_point_position) \
+ { \
+ return CONVERT(a, out_type) / (1 << fixed_point_position); \
+ }
+
+CONVERTQ_UP_IMPL(qs8x16, float16)
+CONVERTQ_UP_IMPL(qs16x16, float16)
+
+#define SQCVT_SAT_IMPL(type) \
+ inline type sqcvt_##type##_sat(float a, int fixed_point_position) \
+ { \
+ return CONVERT_SAT((a * (1 << fixed_point_position) + ((a < 0) ? -0.5f : 0.5f)), type); \
+ }
+
+SQCVT_SAT_IMPL(qs8)
+SQCVT_SAT_IMPL(qs16)
+
+#define SQCVT_SAT_OP_EXPAND_STR(a, type, position) sqcvt_##type##_sat((a), (position))
+#define SQCVT_SAT_OP_EXPAND(a, type, position) SQCVT_SAT_OP_EXPAND_STR((a), type, position)
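+
+/* Worked example: sqcvt_qs8_sat(1.5f, 5) evaluates 1.5f * 2^5 + 0.5f = 48.5f and saturating-converts
+ * it to the qs8 value 48, while sqcvt_qs8_sat(100.0f, 5) saturates to 127.
+ */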
+
+#endif // ARM_COMPUTE_FIXED_POINT_H
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl
new file mode 100644
index 000000000..25e20f5f2
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Perform gather
+ *
+ * @note The input and output data types should be given as preprocessor arguments using -DDATA_TYPE_IN1=type, -DDATA_TYPE_IN2=type and -DDATA_TYPE_OUT=type, e.g. -DDATA_TYPE_IN1=short
+ *
+ * @param[in] input1_ptr Pointer to the first source tensor. Supported data types: U8/S32/F32
+ * @param[in] input1_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input1_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input1_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] input2_ptr Pointer to the second source tensor (the indices). Supported data types: U32
+ * @param[in] input2_stride_x Stride of the second source tensor in X dimension (in bytes)
+ * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the second source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void gather(IMAGE_DECLARATION(input1),
+ VECTOR_DECLARATION(input2),
+ IMAGE_DECLARATION(output))
+{
+ Image in1 = CONVERT_TO_IMAGE_STRUCT_NO_STEP(input1);
+ Vector in2 = CONVERT_TO_VECTOR_STRUCT(input2);
+ Image out = CONVERT_TO_IMAGE_STRUCT_NO_STEP(output);
+
+ VEC_DATA_TYPE(DATA_TYPE_IN2, 2)
+ in2_data = CONVERT(vload2(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_IN2, 2));
+
+  // TODO: performance tuning for memcopy
+  int index = in2_data.s0;
+  int stride = input1_stride_y / input1_stride_x;
+
+  for (int i = 0; i < stride; i++)
+  {
+    *((__global DATA_TYPE_OUT *)offset(&out, i, get_global_id(0))) =
+      *((__global DATA_TYPE_IN1 *)offset(&in1, i, index));
+  }
+}
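+
+/* Example: for a 2-D input1 and an index vector input2 = {3, 0}, work-item 0 copies row 3 of
+ * input1 into row 0 of the output and work-item 1 copies row 0 into row 1, i.e.
+ * output[i, :] = input1[input2[i], :].
+ */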
+
+__kernel void gather_1d_out(IMAGE_DECLARATION(input1),
+ VECTOR_DECLARATION(input2),
+ VECTOR_DECLARATION(output))
+{
+ Image in1 = CONVERT_TO_IMAGE_STRUCT_NO_STEP(input1);
+ Vector in2 = CONVERT_TO_VECTOR_STRUCT(input2);
+ Vector out = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output);
+
+ VEC_DATA_TYPE(DATA_TYPE_IN2, 2)
+ in2_data = CONVERT(vload2(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_IN2, 2));
+
+  // TODO: performance tuning for memcopy
+  int index = in2_data.s0;
+  int stride = input1_stride_y / input1_stride_x;
+
+  for (int i = 0; i < stride; i++)
+  {
+    *((__global DATA_TYPE_OUT *)vector_offset(&out, i + get_global_id(0))) =
+      *((__global DATA_TYPE_IN1 *)offset(&in1, i, index));
+  }
+}
+
+__kernel void gather_1d(VECTOR_DECLARATION(input1),
+ VECTOR_DECLARATION(input2),
+ VECTOR_DECLARATION(output))
+{
+ Vector in1 = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input1);
+ Vector in2 = CONVERT_TO_VECTOR_STRUCT(input2);
+ Vector out = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output);
+
+ VEC_DATA_TYPE(DATA_TYPE_IN2, 2)
+ in2_data = CONVERT(vload2(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_IN2, 2));
+
+  // TODO: performance tuning for memcopy
+  int index = in2_data.s0;
+  *((__global DATA_TYPE_OUT *)vector_offset(&out, get_global_id(0))) =
+    *((__global DATA_TYPE_IN1 *)vector_offset(&in1, index));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
new file mode 100644
index 000000000..8143d2398
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
@@ -0,0 +1,344 @@
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED)
+#if defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(cl_arm_printf)
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED)
+
+#define EXPAND(x) x
+
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name) \
+ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, \
+ uint name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name) \
+ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \
+ uint name##_step_y, uint name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name) \
+ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \
+ uint name##_step_y, uint name##_stride_z, uint name##_step_z, \
+ uint name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name) \
+ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \
+ uint name##_step_y, uint name##_stride_z, uint name##_step_z, uint name##_stride_w, \
+ uint name##_step_w, uint name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+ update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+ name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+ update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+ update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+ name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+ update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \
+ name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+ update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
+ name##_stride_x, name##_step_x, name##_stride_y, \
+ name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+ update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
+ name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, \
+ name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name) \
+ update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+ name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \
+ name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+ update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+ 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \
+ update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+ name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \
+ name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+ update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+ 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, \
+ mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+  __global uchar *ptr; /**< Pointer to the starting position of the buffer */
+ int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+ int stride_x; /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+  __global uchar *ptr; /**< Pointer to the starting position of the buffer */
+ int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+ int stride_x; /**< Stride of the image in X dimension (in bytes) */
+ int stride_y; /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+  __global uchar *ptr; /**< Pointer to the starting position of the buffer */
+ int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+ int stride_x; /**< Stride of the image in X dimension (in bytes) */
+ int stride_y; /**< Stride of the image in Y dimension (in bytes) */
+ int stride_z; /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+  __global uchar *ptr; /**< Pointer to the starting position of the buffer */
+ int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+ int stride_x; /**< Stride of the image in X dimension (in bytes) */
+ int stride_y; /**< Stride of the image in Y dimension (in bytes) */
+ int stride_z; /**< Stride of the image in Z dimension (in bytes) */
+ int stride_w; /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into a Vector structure, and make the pointer point at this workitem's
+ * data.
+ *
+ * @param[in] ptr Pointer to the starting position of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per
+ * workitem(in bytes)
+ *
+ * @return A vector object
+ */
+Vector inline update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
+ uint stride_x, uint step_x)
+{
+ Vector vector = {
+ .ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ };
+ vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+ return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's
+ * data.
+ *
+ * @param[in] ptr Pointer to the starting position of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x Stride of the image in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per
+ * workitem(in bytes)
+ * @param[in] stride_y Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y stride_y * number of elements along Y processed per
+ * workitem(in bytes)
+ *
+ * @return An image object
+ */
+Image inline update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
+ uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+ Image img = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y};
+ img.ptr +=
+ img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+ return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this
+ * workitem's data.
+ *
+ * @param[in] ptr Pointer to the starting position of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x Stride of the image in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per
+ * workitem(in bytes)
+ * @param[in] stride_y Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y stride_y * number of elements along Y processed per
+ * workitem(in bytes)
+ * @param[in] stride_z Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z stride_z * number of elements along Z processed per
+ * workitem(in bytes)
+ *
+ * @return An image object
+ */
+Image inline update_image_from_tensor3D_workitem_ptr(__global uchar *ptr,
+ uint offset_first_element_in_bytes,
+ uint stride_x, uint step_x, uint stride_y,
+ uint step_y, uint stride_z, uint step_z)
+{
+ Image img = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y};
+ img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x +
+ get_global_id(1) * step_y + get_global_id(2) * step_z;
+ return img;
+}
+
+/** Wrap 3D tensor information into a tensor structure, and make the pointer point at this
+ * workitem's data.
+ *
+ * @param[in] ptr Pointer to the starting position of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x Stride of the image in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per
+ * workitem(in bytes)
+ * @param[in] stride_y Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y stride_y * number of elements along Y processed per
+ * workitem(in bytes)
+ * @param[in] stride_z Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z stride_z * number of elements along Z processed per
+ * workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+Tensor3D inline update_tensor3D_workitem_ptr(__global uchar *ptr,
+ uint offset_first_element_in_bytes, uint stride_x,
+ uint step_x, uint stride_y, uint step_y, uint stride_z,
+ uint step_z)
+{
+ Tensor3D tensor = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y,
+ .stride_z = stride_z};
+ tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x +
+ get_global_id(1) * step_y + get_global_id(2) * step_z;
+ return tensor;
+}
+
+Tensor4D inline update_tensor4D_workitem_ptr(__global uchar *ptr,
+ uint offset_first_element_in_bytes, uint stride_x,
+ uint step_x, uint stride_y, uint step_y, uint stride_z,
+ uint step_z, uint stride_w, uint step_w, uint mod_size)
+{
+ Tensor4D tensor = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y,
+ .stride_z = stride_z,
+ .stride_w = stride_w};
+
+ tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x +
+ get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z +
+ (get_global_id(2) / mod_size) * step_w;
+ return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x Relative X position
+ */
+__global inline const uchar *vector_offset(const Vector *vec, int x)
+{
+ return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of an Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x Relative X position
+ * @param[in] y Relative Y position
+ */
+__global inline uchar *offset(const Image *img, int x, int y)
+{
+ return img->ptr + x * img->stride_x + y * img->stride_y;
+}
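+
+/* Usage sketch (illustrative only, not part of this header): a kernel that copies one element per
+ * work-item using the helpers above; DATA_TYPE is assumed to be passed with -DDATA_TYPE=<type>.
+ *
+ *     __kernel void copy_example(IMAGE_DECLARATION(src), IMAGE_DECLARATION(dst))
+ *     {
+ *         Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ *         Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+ *         *((__global DATA_TYPE *)dst.ptr) = *((__global DATA_TYPE *)src.ptr);
+ *     }
+ */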
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x Relative X position
+ * @param[in] y Relative Y position
+ * @param[in] z Relative Z position
+ */
+__global inline const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+ return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x Relative X position
+ * @param[in] y Relative Y position
+ * @param[in] z Relative Z position
+ * @param[in] w Relative W position
+ */
+__global inline const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+ return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z +
+ w * tensor->stride_w;
+}
+
+#endif // ARM_COMPUTE_HELPER_H
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h b/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
new file mode 100644
index 000000000..c39138caa
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
@@ -0,0 +1,406 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPERS_ASYMM_H
+#define ARM_COMPUTE_HELPERS_ASYMM_H
+
+#include "helpers.h"
+
+/** Correctly-rounded-to-nearest division by a power-of-two.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Correctly-rounded-to-nearest division by a power-of-two.
+ */
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+ { \
+ VEC_DATA_TYPE(int, size) \
+ mask = (1 << exponent) - 1; \
+ const VEC_DATA_TYPE(int, size) zero = 0; \
+ const VEC_DATA_TYPE(int, size) one = 1; \
+ VEC_DATA_TYPE(int, size) \
+ threshold = (mask >> 1) + select(zero, one, x < 0); \
+ return (x >> exponent) + select(zero, one, (x & mask) > threshold); \
+ }
+
+/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
+ * rounding to the nearest value, and saturating -1 * -1 to the maximum value.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Product of two fixed-point numbers.
+ */
+#define ASYMM_MULT_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+ { \
+ VEC_DATA_TYPE(int, size) \
+ overflow = a == b && a == INT_MIN; \
+ VEC_DATA_TYPE(long, size) \
+ a_64 = convert_long##size(a); \
+ VEC_DATA_TYPE(long, size) \
+ b_64 = convert_long##size(b); \
+ VEC_DATA_TYPE(long, size) \
+ ab_64 = a_64 * b_64; \
+ /* COMPMID-907 */ \
+ VEC_DATA_TYPE(int, size) \
+ ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31)); \
+ return select(ab_x2_high32, INT_MAX, overflow); \
+ }
+
+/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) \
+ a) \
+ { \
+ const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \
+ const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \
+ const int k_fractional_bits = 31; \
+ VEC_DATA_TYPE(int, size) \
+ x = a + (1 << (k_fractional_bits - 3)); \
+ VEC_DATA_TYPE(int, size) \
+ x2 = ASYMM_MULT(x, x, size); \
+ VEC_DATA_TYPE(int, size) \
+ x3 = ASYMM_MULT(x2, x, size); \
+ VEC_DATA_TYPE(int, size) \
+ x4 = ASYMM_MULT(x2, x2, size); \
+ VEC_DATA_TYPE(int, size) \
+ x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \
+ VEC_DATA_TYPE(int, size) \
+ x4_over_24_plus_x3_over_6_plus_x2 = \
+ ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \
+ VEC_DATA_TYPE(int, size) \
+ x4_over_24_plus_x3_over_6_plus_x2_over_2 = \
+ ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \
+ return constant_term + \
+ ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \
+ }
+
+/** Each bit of the result is set to the corresponding bit of either then_val or
+ * else_val depending on whether the corresponding bit of if_mask is set.
+ * Equivalent to the VBSL instruction in ARM NEON.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Result containing bits from @p then_val or from @p else_val depending on whether the
+ *          corresponding bit in @p if_mask is set.
+ */
+#define ASYMM_SELECT_USING_MASK_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, \
+ VEC_DATA_TYPE(int, size) then_val, \
+ VEC_DATA_TYPE(int, size) else_val) \
+ { \
+ return (if_mask & then_val) ^ (~if_mask & else_val); \
+ }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with bits set when corresponding bit in @p a is zero.
+ */
+#define ASYMM_MASK_IF_ZERO_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \
+ { \
+ const VEC_DATA_TYPE(int, size) all_zeros = 0; \
+ const VEC_DATA_TYPE(int, size) all_ones = ~0; \
+ return select(all_zeros, all_ones, a == 0); \
+ }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is non-zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with bits set when corresponding bit in @p a is non zero.
+ */
+#define ASYMM_MASK_IF_NON_ZERO_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \
+ { \
+ const VEC_DATA_TYPE(int, size) all_zeros = 0; \
+ const VEC_DATA_TYPE(int, size) all_ones = ~0; \
+ return select(all_zeros, all_ones, a != 0); \
+ }
+
+#define EXP_BARREL_SHIFTER_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size( \
+ VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, \
+ int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
+ { \
+ if (k_integer_bits > exponent) \
+ { \
+ const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0; \
+ return ASYMM_SELECT_USING_MASK( \
+ ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \
+ ASYMM_MULT(result, fp_multiplier, size), result, size); \
+ } \
+ \
+ return result; \
+ }
+
+/** Calculates \f$ exp(x) \f$ for x < 0.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \
+ { \
+ const int k_fractional_bits = 31 - k_integer_bits; \
+ VEC_DATA_TYPE(int, size) \
+ k_one_quarter = 1 << (k_fractional_bits - 2); \
+ VEC_DATA_TYPE(int, size) \
+ mask = k_one_quarter - 1; \
+ VEC_DATA_TYPE(int, size) \
+ a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter; \
+ VEC_DATA_TYPE(int, size) \
+ a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \
+ VEC_DATA_TYPE(int, size) \
+ result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL( \
+ a_mod_quarter_minus_one_quarter_scaled, size); \
+ VEC_DATA_TYPE(int, size) \
+ remainder = a_mod_quarter_minus_one_quarter - a; \
+ \
+ result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, \
+ remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, \
+ remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, \
+ remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, \
+ remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, \
+ remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, \
+ size); \
+ result = \
+ EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \
+ \
+ if (k_integer_bits > 5) \
+ { \
+ const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5)); \
+ result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size); \
+ } \
+ \
+ const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \
+ return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size); \
+ }
+
+/** Calculates the product of an integer value by a power of two, with either a positive exponent
+ * (equivalent to an arithmetic left shift, saturating) or a negative exponent
+ * (equivalent to an arithmetic right shift, rounding to nearest).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Arithmetic left or right shift.
+ */
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+ { \
+ if (exponent < 0) \
+ { \
+ return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \
+ } \
+ \
+ const VEC_DATA_TYPE(int, size) min = INT_MIN; \
+ const VEC_DATA_TYPE(int, size) max = INT_MAX; \
+ int threshold = ((1 << (31 - exponent)) - 1); \
+ VEC_DATA_TYPE(int, size) \
+ positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \
+ VEC_DATA_TYPE(int, size) \
+ negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \
+ VEC_DATA_TYPE(int, size) \
+ result = x << exponent; \
+ result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \
+ result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \
+ return result; \
+ }
+
+/** Calculates (a+b)/2, rounded to the nearest integer.
+ * Equivalent to VRHADD in the ARM NEON instruction set.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return (a+b)/2, rounded to the nearest integer.
+ */
+#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+ { \
+ VEC_DATA_TYPE(long, size) \
+ a64 = convert_long##size(a); \
+ VEC_DATA_TYPE(long, size) \
+ b64 = convert_long##size(b); \
+ VEC_DATA_TYPE(long, size) \
+ sum = a64 + b64; \
+ const VEC_DATA_TYPE(long, size) one = 1; \
+ const VEC_DATA_TYPE(long, size) minus_one = -1; \
+ VEC_DATA_TYPE(long, size) \
+ sign = select(minus_one, one, sum >= 0); \
+ return convert_int##size((sum + sign) / 2); \
+ }
+
+/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \
+ { \
+ const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \
+ const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2); \
+ VEC_DATA_TYPE(int, size) \
+ half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size); \
+ const VEC_DATA_TYPE(int, size) Q2_48_over_17 = 1515870810; \
+ const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540; \
+ VEC_DATA_TYPE(int, size) \
+ x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size); \
+ for (int i = 0; i < 3; i++) \
+ { \
+ VEC_DATA_TYPE(int, size) \
+ half_denominator_times_x = ASYMM_MULT(half_denominator, x, size); \
+ VEC_DATA_TYPE(int, size) \
+ one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x; \
+ VEC_DATA_TYPE(int, size) \
+ tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size); \
+ x = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size); \
+ } \
+ return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size); \
+ }
+
+/** Considering the integer value as fixed-point, changes the number of integer bits and updates the
+ * value accordingly.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Rescaled value.
+ */
+#define ASYMM_RESCALE_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, \
+ int src_integer_bits, int dst_integer_bits) \
+ { \
+ int exponent = src_integer_bits - dst_integer_bits; \
+ return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \
+ }
+
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) \
+ asymm_rounding_divide_by_POW2_##size(x, exponent)
+#define ASYMM_MULT(a, b, size) asymm_mult##size(a, b)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
+ ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) \
+ asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
+#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) \
+ asymm_select_using_mask##size(if_mask, then_val, else_val)
+#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
+#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
+#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, \
+ remainder, size) \
+ exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, \
+ remainder)
+#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) \
+ asymm_exp_on_negative_values##size(a, k_integer_bits)
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) \
+ asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) \
+ asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
+#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
+#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \
+ asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
+
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)
+
+ASYMM_MULT_IMPL(2)
+ASYMM_MULT_IMPL(4)
+ASYMM_MULT_IMPL(8)
+ASYMM_MULT_IMPL(16)
+
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16)
+
+ASYMM_SELECT_USING_MASK_IMPL(2)
+ASYMM_SELECT_USING_MASK_IMPL(4)
+ASYMM_SELECT_USING_MASK_IMPL(8)
+ASYMM_SELECT_USING_MASK_IMPL(16)
+
+ASYMM_MASK_IF_ZERO_IMPL(2)
+ASYMM_MASK_IF_ZERO_IMPL(4)
+ASYMM_MASK_IF_ZERO_IMPL(8)
+ASYMM_MASK_IF_ZERO_IMPL(16)
+
+ASYMM_MASK_IF_NON_ZERO_IMPL(2)
+ASYMM_MASK_IF_NON_ZERO_IMPL(4)
+ASYMM_MASK_IF_NON_ZERO_IMPL(8)
+ASYMM_MASK_IF_NON_ZERO_IMPL(16)
+
+EXP_BARREL_SHIFTER_IMPL(2)
+EXP_BARREL_SHIFTER_IMPL(4)
+EXP_BARREL_SHIFTER_IMPL(8)
+EXP_BARREL_SHIFTER_IMPL(16)
+
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16)
+
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16)
+
+ASYMM_ROUNDING_HALF_SUM_IMPL(2)
+ASYMM_ROUNDING_HALF_SUM_IMPL(4)
+ASYMM_ROUNDING_HALF_SUM_IMPL(8)
+ASYMM_ROUNDING_HALF_SUM_IMPL(16)
+
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16)
+
+ASYMM_RESCALE_IMPL(2)
+ASYMM_RESCALE_IMPL(4)
+ASYMM_RESCALE_IMPL(8)
+ASYMM_RESCALE_IMPL(16)
+
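+/* Usage sketch (illustrative only): gemmlowp-style requantization of an int16 accumulator vector
+ * acc, where result_multiplier and result_shift are assumed kernel build options or arguments:
+ *
+ *     int16 scaled = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(acc, (int16)(result_multiplier),
+ *                                                                 result_shift, 16);
+ */
+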
+#endif // ARM_COMPUTE_HELPERS_ASYMM_H
\ No newline at end of file
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl
new file mode 100644
index 000000000..512c62023
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#ifdef SATURATE
+#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x))
+#else /* SATURATE */
+#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x))
+#endif /* SATURATE */
+#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round)
+
+/** Performs a pixelwise division with float scale of either integer or float inputs.
+ *
+ * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short
+ * @attention The data type of the intermediate result of the division should be passed as well using -DDATA_TYPE_RES.
+ * e.g. if one of the inputs is S16, -DDATA_TYPE_RES=int should be passed, else -DDATA_TYPE_RES=short.
+ * @attention -DDATA_TYPE_FLOAT must be passed if floating point inputs are provided.
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8, S16, F16, F32
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16, F16, F32
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16, F16, F32
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] scale Float scaling factor. Supported data types: F32
+ */
+__kernel void pixelwise_div_float(
+ TENSOR3D_DECLARATION(in1),
+ TENSOR3D_DECLARATION(in2),
+ TENSOR3D_DECLARATION(out),
+ const float scale)
+{
+ // Get pixels pointer
+ Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+ Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE_RES, 16)
+ in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
+ VEC_DATA_TYPE(DATA_TYPE_RES, 16)
+ in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
+
+ // Perform division
+#ifdef DATA_TYPE_FLOAT
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+ res = CONVERT(in1_data / in2_data * (DATA_TYPE_RES)scale, VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+#else /* DATA_TYPE_FLOAT */
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+ res = CONVERT_OP_FLOAT(CONVERT_OP_FLOAT((convert_float16(in1_data / in2_data) * scale), VEC_DATA_TYPE(DATA_TYPE_RES, 16), ROUND), VEC_DATA_TYPE(DATA_TYPE_OUT, 16), ROUND);
+#endif /* DATA_TYPE_FLOAT */
+
+ // Store result
+ vstore16(res, 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
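+
+/* Example (illustrative): built with -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar
+ * -DDATA_TYPE_RES=ushort -DDATA_TYPE_OUT=uchar -DSATURATE and an assumed rounding suffix such as
+ * -DROUND=_rte, each element becomes the integer quotient in1 / in2, scaled by the float factor
+ * and then saturating-converted back to uchar.
+ */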
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl
new file mode 100644
index 000000000..82edf3b1d
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(FIXED_POINT_POSITION)
+
+#include "fixed_point.h"
+
+#if defined(SATURATE)
+#define DIV_OP(x, y, scale, type, size) DIV_SAT_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION)
+#else // SATURATE
+#define DIV_OP(x, y, scale, type, size) DIV_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION)
+#endif // SATURATE
+
+#else // FIXED_POINT_POSITION
+
+#if defined(SATURATE)
+#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size##_sat(x))
+#else // SATURATE
+#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size(x))
+#endif // SATURATE
+#define CONVERT_OP_INT(x, type, size) CONVERT_OP_INT_STR(x, type, size)
+
+#define DIV_OP(x, y, scale, type, size) CONVERT_OP_INT((x) / (y) >> scale, type, size)
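+// For example, with -DSATURATE defined, DIV_OP(x, y, 1, short, 16) expands to
+// convert_short16_sat((x) / (y) >> 1): the quotient is shifted right by 'scale' before the saturating cast.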
+
+#endif // FIXED_POINT_POSITION
+
+/** Performs a pixelwise division with integer scale of integer inputs.
+ *
+ * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short
+ * @attention The data_type of the intermediate result of the division should be passed as well using -DDATA_TYPE_RES.
+ * e.g. If one of inputs is S16 -DDATA_TYPE_RES=int should be passed else -DDATA_TYPE_RES=short.
+ * @note In case of fixed-point operation -DFIXED_POINT_POSITION=fixed_point_position must be provided: e.g. -DFIXED_POINT_POSITION=3
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8/QS8/QS16/S16
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: same as @p in1_ptr
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in1_ptr
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] scale Integer scaling factor. Supported data types: S32 (ignored for QS8 and QS16 as the assumption is scale = 1).
+ */
+__kernel void pixelwise_div_int(
+ TENSOR3D_DECLARATION(in1),
+ TENSOR3D_DECLARATION(in2),
+ TENSOR3D_DECLARATION(out),
+ const uint scale)
+{
+ // Get pixels pointer
+ Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+ Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE_RES, 16)
+ in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
+ VEC_DATA_TYPE(DATA_TYPE_RES, 16)
+ in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
+
+ // Perform division and store result
+ vstore16(DIV_OP(in1_data, in2_data, scale, DATA_TYPE_OUT, 16), 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
new file mode 100644
index 000000000..ddc9d5a27
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers_asymm.h"
+
+#ifdef SATURATE
+#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x))
+#else /* SATURATE */
+#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x))
+#endif /* SATURATE */
+#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round)
+
+#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
+/** Performs a pixelwise multiplication used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
+ *
+ * The following computations will be performed by the kernel:
+ *
+ * -# Add offset terms to inputs
+ * -# Multiply inputs
+ * -# Multiply each entry of the result by result_mult_int and shift the int32 accumulator right by result_shift
+ * -# Add the offset term to the result
+ * -# Cast the resulting int32 values to QASYMM8.
+ *
+ * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
+ * @attention The offset factor of inputs must be passed at compile time using -DIN1_OFFSET and -DIN2_OFFSET
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULT_INT and -DRESULT_SHIFT
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] scale Float scaling factor. Supported data types: F32
+ */
+__kernel void pixelwise_mul_qasymm8(
+ TENSOR3D_DECLARATION(in1),
+ TENSOR3D_DECLARATION(in2),
+ TENSOR3D_DECLARATION(out),
+ const float scale)
+{
+ // Get pixels pointer
+ Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+ Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+ // Load data
+ VEC_DATA_TYPE(int, 16)
+ in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16));
+ VEC_DATA_TYPE(int, 16)
+ in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16));
+
+ // Perform multiplication of two inputs
+ VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET);
+ VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET);
+ VEC_DATA_TYPE(int, 16) out_val = in1_val * in2_val;
+
+ // Multiply with a multiplier smaller than 1
+ out_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(out_val, RESULT_MULT_INT, RESULT_SHIFT, 16);
+ out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET);
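+  // At this point out_val is roughly
+  //   ((in1 + IN1_OFFSET) * (in2 + IN2_OFFSET) * RESULT_MULT_INT) / 2^(31 + RESULT_SHIFT) + RESULT_OFFSET
+  // i.e. a gemmlowp-style requantization with a fixed-point multiplier smaller than one.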
+
+ VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16));
+
+// TODO: Apply min-max BOUND to support fuse with relu.
+/*
+#if defined(MIN_BOUND)
+ res = max(res, (uchar16)MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ res = min(res, (uchar16)MAX_BOUND);
+#endif // defined(MAX_BOUND)
+*/
+
+ // Store result
+ VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)),
+ 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
+#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl
new file mode 100644
index 000000000..dfa3b85f4
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(WIDTH)
+/** Perform reduce max
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note The width of the input tensor must be given as a preprocessor argument using -DWIDTH=width. e.g. -DWIDTH=128
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void reduce_max(VECTOR_DECLARATION(input),
+ VECTOR_DECLARATION(output))
+{
+ Vector input = CONVERT_TO_VECTOR_STRUCT(input);
+ Vector output = CONVERT_TO_VECTOR_STRUCT(output);
+
+ __global float *input_addr = (__global float *)(input.ptr);
+ __global float *output_addr = (__global float *)(output.ptr);
+
+ float max_value = *input_addr;
+ for(int x = 1; x < WIDTH; x++)
+ {
+ float value = *(input_addr + x);
+ max_value = max(value, max_value);
+ }
+
+ // Store max
+ *output_addr = max_value;
+}
+#endif // defined(WIDTH)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduction_mean.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduction_mean.cl
new file mode 100644
index 000000000..1a96eea61
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduction_mean.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
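+// Horizontally adds 8 consecutive elements loaded from 'input' (pairwise reduction: 4+4, 2+2, 1+1).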
+inline DATA_TYPE sum_8(__global const DATA_TYPE *input)
+{
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ in = vload8(0, input);
+ in.s0123 += in.s4567;
+ in.s01 += in.s23;
+ return ((in.s0 + in.s1));
+}
+
+/** This function calculates the mean of a given input image.
+ *
+ * @note The data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ *
+ * @param[in] src_ptr Pointer to the source image. The data type is given by -DDATA_TYPE
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[out] local_sums Local memory used to accumulate the per-row sums
+ * @param[in] height Number of rows to accumulate
+ * @param[in] divider Divider applied to the accumulated sum to obtain the mean
+ */
+__kernel void reduction_mean(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst),
+ __local DATA_TYPE *local_sums,
+ int height,
+ int divider)
+{
+ // Get pixels pointer
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+  // Calculate the sum over 'height' rows; the mean is written to the destination below
+  local_sums[0] = 0;
+
+ for(int i = 0; i < height; i++)
+ {
+ local_sums[0] += sum_8((__global DATA_TYPE *)offset(&src, 0, i));
+ }
+ ((__global DATA_TYPE *)offset(&dst, get_global_id(0), get_global_id(1)))[0] = local_sums[0]/divider;
+}
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice.cl
new file mode 100644
index 000000000..c5ff82f9e
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice.cl
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+
+inline Tensor4D tensor4D_from_vector_no_step(const Vector *vector, int dim_x, int dim_y, int dim_z, int dim_w)
+{
+ int stride_x = vector->stride_x;
+ int stride_y = stride_x * dim_x;
+ int stride_z = stride_y * dim_y;
+ int stride_w = stride_z * dim_z;
+ Tensor4D tensor =
+ {
+ .ptr = vector->ptr,
+ .offset_first_element_in_bytes = vector->offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y,
+ .stride_z = stride_z,
+ .stride_w = stride_w,
+ };
+ return tensor;
+}
+
+/** Extracts a strided slice up to 4-dimensions
+ *
+ * @note Datatype should be given as a preprocessor argument using -DELEMENT_DATA_TYPE=type. e.g. -DELEMENT_DATA_TYPE=short
+ * @note The size of an element should be given as a preprocessor argument using -DELEMENT_SIZE=size. e.g. -DELEMENT_SIZE=2
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] dims_in The 4-dimensional dimension of the input. Supported data types: S32
+ * @param[in] dims_out The 4-dimensional dimension of the output. Supported data types: S32
+ * @param[in] starts The start indices of the 4 dimensions of the input tensor to be sliced. Supported data types: S32
+ * @param[in] strides The strides of the 4 dimensions of the input tensor to be sliced. Supported data types: S32
+ */
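+// Example: with starts = (1, 0, 0, 0) and strides = (2, 1, 1, 1), output element (x, y, z, w)
+// is read from input element (1 + 2 * x, y, z, w).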
+__kernel void strided_slice(VECTOR_DECLARATION(input),
+ VECTOR_DECLARATION(output),
+ const int4 dims_in,
+ const int4 dims_out,
+ const int4 starts,
+ const int4 strides)
+{
+  // TODO: Should be changed to CONVERT_TO_TENSOR4D_STRUCT in order to avoid re-deriving the 4D offsets below
+ Vector vec_out = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output);
+ Vector vec_in = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input);
+
+  // Implementation:
+ // Infer a Tensor4D from output Vector and output's dimensions info
+ // Infer a Tensor4D from input Vector and input's dimensions info
+ // Infer indices of output as 4D from the offset of output vector
+ // Infer indices of input as 4D from indices of output
+ // out(offset of output vector) = in(offset of input)
+
+ Tensor4D tensor_out = tensor4D_from_vector_no_step(&vec_out, dims_out.x, dims_out.y, dims_out.z, dims_out.w);
+ Tensor4D tensor_in = tensor4D_from_vector_no_step(&vec_in, dims_in.x, dims_in.y, dims_in.z, dims_in.w);
+
+  // Note: output_step_x must equal output_stride_x, which must equal the element size
+ const int offset_out = get_global_id(0) * output_stride_x;
+ int4 indices_out =
+ {
+ get_global_id(0) % dims_out.x,
+ (offset_out / tensor_out.stride_y) % dims_out.y,
+ (offset_out / tensor_out.stride_z) % dims_out.z,
+ (offset_out / tensor_out.stride_w) % dims_out.w,
+ };
+
+ int4 indices_in =
+ {
+ starts.x + (strides.x * indices_out.x),
+ starts.y + (strides.y * indices_out.y),
+ starts.z + (strides.z * indices_out.z),
+ starts.w + (strides.w * indices_out.w),
+ };
+
+ *((__global ELEMENT_DATA_TYPE *)vector_offset(&vec_out, get_global_id(0))) = *((__global ELEMENT_DATA_TYPE *)tensor4D_offset(&tensor_in, indices_in.x, indices_in.y, indices_in.z, indices_in.w));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl
new file mode 100644
index 000000000..0b0cf8218
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+
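+// Copies the input vector into a raw key buffer and fills the index buffer with 0..n-1,
+// so that the subsequent sort kernels can permute the keys and their original indices together.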
+__kernel void topkv2_init(VECTOR_DECLARATION(input),
+ __global float* in_key_buf,
+ __global int* in_ind_buf,
+ const int n)
+{
+ int gid = get_global_id(0);
+ int lws = get_local_size(0);
+ int groups = get_num_groups(0);
+ int gws = lws * groups;
+ int iter = n / gws;
+
+ Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input);
+
+ for(int i = 0; i < iter; ++i)
+ {
+ int idx = i * gws + gid;
+ in_key_buf[idx] = *(__global float*)(input.ptr + idx * input.stride_x);
+ in_ind_buf[idx] = idx;
+ }
+}
+
+__kernel void topkv2_find_first_negative(
+ __global float *out_key_buf,
+ __global int *first_negative_idx,
+ int n)
+{
+ int gid = get_global_id(0);
+
+ if( gid == n - 1 )
+ {
+ // if the last item is positive, the first negative index is n.
+ if( out_key_buf[gid] > 0.f )
+ *first_negative_idx = n;
+ } else if ( gid == 0 ) {
+    // if the first item is negative, the first negative index is 0.
+ if( out_key_buf[gid] < 0.f )
+ *first_negative_idx = 0;
+ } else {
+    // if the item to its left is positive and this one is negative, it is the first negative item.
+ if( out_key_buf[gid-1] > 0.f && out_key_buf[gid] < 0.f )
+ *first_negative_idx = gid;
+ }
+}
+
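+// After sorting the float keys by their unsigned bit patterns, the negative values sit at the
+// tail in descending order; this kernel moves them to the front so the whole buffer is ascending.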
+__kernel void topkv2_reorder_negatives(
+ __global float* in_key_buf,
+ __global float* out_key_buf,
+ __global float* in_ind_buf,
+ __global float* out_ind_buf,
+ __global int* first_negative_idx,
+ int n)
+{
+ int gid = get_global_id(0);
+
+ int num_negs = n - *first_negative_idx;
+ int in_idx;
+
+ if( gid < num_negs ) {
+ in_idx = n - 1 - gid;
+ } else {
+ in_idx = gid - num_negs;
+ }
+
+ out_key_buf[gid] = in_key_buf[in_idx];
+ out_ind_buf[gid] = in_ind_buf[in_idx];
+}
+
+__kernel void topkv2_store(
+ VECTOR_DECLARATION(values),
+ VECTOR_DECLARATION(indices),
+ __global float *out_key_buf,
+ __global int *out_ind_buf,
+ int n)
+{
+ int gid = get_global_id(0);
+
+ Vector values = CONVERT_TO_VECTOR_STRUCT_NO_STEP(values);
+ Vector indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(indices);
+
+ int idx = n - 1 - gid;
+
+ *(__global float*)(values.ptr + gid * values.stride_x) = out_key_buf[idx];
+ *(__global int*)(indices.ptr + gid * indices.stride_x) = out_ind_buf[idx];
+}
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl
new file mode 100644
index 000000000..deadf8412
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+
+__global inline float* get_vec_elem(Vector* vec, int idx)
+{
+ return (__global float*)(vec->ptr + idx * vec->stride_x);
+}
+
+__global inline int* get_vec_elem_int(Vector* vec, int idx)
+{
+ return (__global int*)(vec->ptr + idx * vec->stride_x);
+}
+
+// A utility function to swap two elements
+void swap(__global float *a, __global float *b)
+{
+ float t = *a;
+ *a = *b;
+ *b = t;
+}
+
+void swap_idx(__global int *a, __global int *b)
+{
+ int t = *a;
+ *a = *b;
+ *b = t;
+}
+
+/* This function is the same for both the iterative and recursive versions of quicksort */
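+/* Note: the comparison below is '>=', so partition orders elements in descending order;
+   after quickSortIterative finishes, the k largest values are at the front of the vector. */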
+int partition (Vector* arr, __global int* indices, int l, int h)
+{
+ float x = *get_vec_elem(arr, h);
+ int i = (l - 1);
+
+ for (int j = l; j <= h- 1; j++)
+ {
+ if (*get_vec_elem(arr, j) >= x)
+ {
+ i++;
+ swap (get_vec_elem(arr,i), get_vec_elem(arr,j));
+ swap_idx(&indices[i], &indices[j]);
+ }
+ }
+ swap (get_vec_elem(arr, i + 1), get_vec_elem(arr, h));
+ swap_idx(&indices[i + 1], &indices[h]);
+ return (i + 1);
+}
+
+/* A[] --> Array to be sorted,
+ l --> Starting index,
+ h --> Ending index */
+void quickSortIterative (Vector* arr, __global int* indices,
+ __global int *stack, int l, int h)
+{
+  // The auxiliary stack is provided by the caller via 'stack'
+
+ // initialize top of stack
+ int top = -1;
+
+ // push initial values of l and h to stack
+ stack[ ++top ] = l;
+ stack[ ++top ] = h;
+
+  // Keep popping from the stack while it is not empty
+ while ( top >= 0 )
+ {
+ // Pop h and l
+ h = stack[ top-- ];
+ l = stack[ top-- ];
+
+ // Set pivot element at its correct position
+ // in sorted array
+ int p = partition( arr, indices, l, h );
+
+ // If there are elements on left side of pivot,
+ // then push left side to stack
+ if ( p-1 > l )
+ {
+ stack[ ++top ] = l;
+ stack[ ++top ] = p - 1;
+ }
+
+ // If there are elements on right side of pivot,
+ // then push right side to stack
+ if ( p+1 < h )
+ {
+ stack[ ++top ] = p + 1;
+ stack[ ++top ] = h;
+ }
+ }
+}
+
+__kernel void topkv2_quicksort(VECTOR_DECLARATION(input),
+ VECTOR_DECLARATION(topk_values), VECTOR_DECLARATION(topk_indices),
+ __global int* indices, __global int* temp_stack, int k, int n)
+{
+ Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input);
+ Vector topk_values = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_values);
+ Vector topk_indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_indices);
+
+ for( int i = 0; i < n; ++i )
+ {
+ indices[i] = i;
+ }
+
+ quickSortIterative(&input, indices, temp_stack, 0, n-1);
+
+ // extract k items.
+ for(int i = 0; i < k; ++i)
+ {
+ *get_vec_elem(&topk_values, i) = *get_vec_elem(&input, i);
+ *get_vec_elem_int(&topk_indices, i) = indices[i];
+ }
+}
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
new file mode 100644
index 000000000..cac0c071e
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
@@ -0,0 +1,279 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+// reference:
+// https://code.google.com/archive/p/ocl-radix-sort/source/default/source
+// OpenCL kernel sources for the CLRadixSort class
+// the #include does not exist in OpenCL
+// Copyright Philippe Helluy, Université de Strasbourg, France, 2011, helluy@math.unistra.fr
+// licensed under the GNU Lesser General Public License see http://www.gnu.org/copyleft/lesser.html
+// if you find this software useful you can cite the following work in your reports or articles:
+// Philippe HELLUY, A portable implementation of the radix sort algorithm in OpenCL, 2011.
+// http://hal.archives-ouvertes.fr/hal-00596730
+
+// Reference for floating point radix sort:
+// http://www.codercorner.com/RadixSortRevisited.htm
+
+// compute the histogram for each radix and each virtual processor for the pass
+__kernel void radixsort_histogram(__global float* in_key_buf,
+ __global int* d_Histograms,
+ const int pass,
+ __local int* loc_histo,
+ const int n)
+{
+ int it = get_local_id(0); // i local number of the processor
+ int ig = get_global_id(0); // global number = i + g I
+
+ int gr = get_group_id(0); // g group number
+
+ int groups = get_num_groups(0);
+ int items = get_local_size(0);
+
+ // set the local histograms to zero
+ for(int ir=0;ir<_RADIX;ir++){
+ loc_histo[ir * items + it] = 0;
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // range of keys that are analyzed by the work item
+ int size= n/groups/items; // size of the sub-list
+ int start= ig * size; // beginning of the sub-list
+
+ unsigned int key;
+ int shortkey,k;
+
+ // compute the index
+ // the computation depends on the transposition
+ for(int j = 0; j < size ; j++) {
+#ifdef TRANSPOSE
+ k= groups * items * j + ig;
+#else
+ k=j+start;
+#endif
+
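+    // Reinterpret the float key as an unsigned int: for non-negative IEEE-754 values the unsigned
+    // bit pattern preserves the ordering (see the codercorner reference above); negative keys are
+    // expected to be fixed up afterwards by the reorder kernels in topkv2.cl.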
+ key = *((__global unsigned int*)(in_key_buf + k));
+
+ // extract the group of _BITS bits of the pass
+ // the result is in the range 0.._RADIX-1
+ shortkey=(( key >> (pass * _BITS)) & (_RADIX-1));
+
+ // increment the local histogram
+ loc_histo[shortkey * items + it ]++;
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // copy the local histogram to the global one
+ for(int ir=0;ir<_RADIX;ir++) {
+ d_Histograms[items * (ir * groups + gr) + it] = loc_histo[ir * items + it];
+ }
+
+ barrier(CLK_GLOBAL_MEM_FENCE);
+}
+
+// initial transpose of the list for improving
+// coalescent memory access
+__kernel void transpose(const __global int* invect,
+ __global int* outvect,
+ const int nbcol,
+ const int nbrow,
+ const __global int* inperm,
+ __global int* outperm,
+ __local int* blockmat,
+ __local int* blockperm,
+ const int tilesize){
+
+ int i0 = get_global_id(0)*tilesize; // first row index
+ int j = get_global_id(1); // column index
+
+ int jloc = get_local_id(1); // local column index
+
+ // fill the cache
+ for(int iloc=0;iloc<tilesize;iloc++){
+ int k=(i0+iloc)*nbcol+j; // position in the matrix
+ blockmat[iloc*tilesize+jloc]=invect[k];
+#ifdef PERMUT
+ blockperm[iloc*tilesize+jloc]=inperm[k];
+#endif
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // first row index in the transpose
+ int j0=get_group_id(1)*tilesize;
+
+ // put the cache at the good place
+ for(int iloc=0;iloc<tilesize;iloc++){
+ int kt=(j0+iloc)*nbrow+i0+jloc; // position in the transpose
+ outvect[kt]=blockmat[jloc*tilesize+iloc];
+#ifdef PERMUT
+ outperm[kt]=blockperm[jloc*tilesize+iloc];
+#endif
+ }
+
+}
+
+// each virtual processor reorders its data using the scanned histogram
+__kernel void radixsort_reorder(__global float* in_key,
+ __global float* out_key,
+ __global int* d_Histograms,
+ const int pass,
+ __global int* indices_in,
+ __global int* indices_out,
+ __local int* loc_histo,
+ const int n){
+
+ int it = get_local_id(0);
+ int ig = get_global_id(0);
+
+ int gr = get_group_id(0);
+ int groups=get_num_groups(0);
+ int items=get_local_size(0);
+
+ int start= ig *(n/groups/items);
+ int size= n/groups/items;
+
+ // take the histogram in the cache
+ for(int ir=0;ir<_RADIX;ir++){
+ loc_histo[ir * items + it]=
+ d_Histograms[items * (ir * groups + gr) + it];
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ int newpos,shortkey,k,newpost;
+ unsigned int key;
+
+ for(int j= 0; j< size;j++){
+#ifdef TRANSPOSE
+ k= groups * items * j + ig;
+#else
+ k=j+start;
+#endif
+ float org_value = in_key[k];
+ key = *(__global unsigned int*)(in_key + k);
+ shortkey=((key >> (pass * _BITS)) & (_RADIX-1));
+
+ newpos=loc_histo[shortkey * items + it];
+
+#ifdef TRANSPOSE
+ int ignew,jnew;
+ ignew= newpos/(n/groups/items);
+ jnew = newpos%(n/groups/items);
+ newpost = jnew * (groups*items) + ignew;
+#else
+ newpost=newpos;
+#endif
+
+ //d_outKeys[newpost]= key; // killing line !!!
+ out_key[newpost] = org_value;
+
+#ifdef PERMUT
+ indices_out[newpost] = indices_in[k];
+#endif
+
+ newpos++;
+ loc_histo[shortkey * items + it]=newpos;
+ }
+}
+
+// perform a parallel prefix sum (a scan) on the local histograms
+// (see Blelloch 1990) each workitem worries about two memories
+// see also http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html
+__kernel void radixsort_scanhistograms(__global int* histo, __local int* temp, __global int* globsum)
+{
+ int it = get_local_id(0);
+ int ig = get_global_id(0);
+ int decale = 1;
+ int n=get_local_size(0) * 2 ;
+ int gr=get_group_id(0);
+
+ // load input into local memory
+ // up sweep phase
+ temp[2*it] = histo[2*ig];
+ temp[2*it+1] = histo[2*ig+1];
+
+ // parallel prefix sum (algorithm of Blelloch 1990)
+ for (int d = n>>1; d > 0; d >>= 1){
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (it < d){
+ int ai = decale*(2*it+1)-1;
+ int bi = decale*(2*it+2)-1;
+ temp[bi] += temp[ai];
+ }
+ decale *= 2;
+ }
+
+ // store the last element in the global sum vector
+ // (maybe used in the next step for constructing the global scan)
+ // clear the last element
+ if (it == 0) {
+ globsum[gr]=temp[n-1];
+ temp[n - 1] = 0;
+ }
+
+ // down sweep phase
+ for (int d = 1; d < n; d *= 2){
+ decale >>= 1;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (it < d){
+ int ai = decale*(2*it+1)-1;
+ int bi = decale*(2*it+2)-1;
+
+ int t = temp[ai];
+ temp[ai] = temp[bi];
+ temp[bi] += t;
+ }
+
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // write results to device memory
+
+ histo[2*ig] = temp[2*it];
+ histo[2*ig+1] = temp[2*it+1];
+
+ barrier(CLK_GLOBAL_MEM_FENCE);
+
+}
+
+// use the global sum for updating the local histograms
+// each work item updates two values
+__kernel void radixsort_pastehistograms( __global int* histo,__global int* globsum)
+{
+ int ig = get_global_id(0);
+ int gr=get_group_id(0);
+
+ int s;
+
+ s=globsum[gr];
+
+ // write results to device memory
+ histo[2*ig] += s;
+ histo[2*ig+1] += s;
+
+ barrier(CLK_GLOBAL_MEM_FENCE);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
new file mode 100644
index 000000000..b019e8c33
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLCastKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLCastKernel::CLCastKernel() : _input(nullptr), _output(nullptr) {}
+
+void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+ DataType::S16, DataType::S32, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+ DataType::S16, DataType::S32, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+ _input = input;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+ build_opts.emplace(
+ ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+
+ // Create kernel
+ if (is_data_type_quantized_asymmetric(input->info()->data_type()))
+ {
+ const float scale_in = input->info()->quantization_info().scale;
+ const int offset_in = input->info()->quantization_info().offset;
+ build_opts.emplace("-DSCALE_IN=" + float_to_string_with_full_precision(scale_in));
+ build_opts.emplace("-DOFFSET_IN=" + support::cpp11::to_string(offset_in));
+
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("cast_qasymm_in", build_opts));
+ }
+ else if (is_data_type_quantized_asymmetric(output->info()->data_type()))
+ {
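+    // For a quantized output, the scale/offset passed to the kernel come from the output
+    // tensor's quantization info.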
+ const float scale_in = output->info()->quantization_info().scale;
+ const int offset_in = output->info()->quantization_info().offset;
+ build_opts.emplace("-DSCALE_IN=" + float_to_string_with_full_precision(scale_in));
+ build_opts.emplace("-DOFFSET_IN=" + support::cpp11::to_string(offset_in));
+
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("cast_qasymm_out", build_opts));
+ }
+ else
+ {
+ _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("cast", build_opts));
+ }
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+void CLCastKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ } while (collapsed.slide_window_slice_3D(slice));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp
new file mode 100644
index 000000000..23efafa6a
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLGatherKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cmath>
+#include <cstdlib>
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S32,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S32,
+ DataType::F32);
+
+ return Status{};
+}
+
+} // namespace
+
+CLGatherKernel::CLGatherKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr) {}
+
+void CLGatherKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ // Construct kernel name
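+  // "gather" handles a 2D input with a 2D output, "gather_1d" a 1D input, and
+  // "gather_1d_out" a 2D input gathered into a 1D output.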
+ std::string kernel_name = "gather";
+ if (input1->info()->num_dimensions() == 1)
+ {
+ kernel_name = "gather_1d";
+ }
+ else if (input1->info()->num_dimensions() == 2)
+ {
+ if (_output->info()->num_dimensions() == 1)
+ {
+ kernel_name = "gather_1d_out";
+ }
+ }
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+
+ // Create kernel
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+ // Configure kernel window
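+  // Note: the per-iteration element count is overridden to 1 here (shadowing the file-scope
+  // value of 16): the gather window steps one index per work item.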
+ const unsigned int num_elems_processed_per_iteration = 1;
+ Window win = calculate_max_window(*input2->info(), Steps(num_elems_processed_per_iteration));
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+Status CLGatherKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output));
+
+ return Status{};
+}
+
+void CLGatherKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ if (_input1->info()->num_dimensions() == 1)
+ {
+ Window slice = window.first_slice_window_1D();
+
+ unsigned int idx = 0;
+ add_1D_tensor_argument(idx, _input1, slice);
+ add_1D_tensor_argument(idx, _input2, slice);
+ add_1D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ else if (_input1->info()->num_dimensions() == 2)
+ {
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY);
+ Window slice = window.collapse_if_possible(ICLKernel::window(), Window::DimX);
+
+ // Set inputs
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input1, window_collapsed);
+ add_1D_tensor_argument(idx, _input2, slice);
+ if (_output->info()->num_dimensions() == 1)
+ {
+ add_1D_tensor_argument(idx, _output, slice);
+ }
+ else
+ {
+ add_2D_tensor_argument(idx, _output, window_collapsed);
+ }
+ enqueue(queue, *this, slice);
+ }
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp
new file mode 100644
index 000000000..a3e0163de
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cmath>
+#include <cstdlib>
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output, float scale, ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy)
+{
+ ARM_COMPUTE_UNUSED(overflow_policy);
+ ARM_COMPUTE_UNUSED(rounding_policy);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8,
+ DataType::QS16, DataType::S16, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8,
+ DataType::QS16, DataType::S16, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative.");
+
+ const TensorShape &out_shape =
+ TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
+ "Inputs are not broadcast compatible");
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2);
+
+ if (is_data_type_fixed_point(input1->data_type()))
+ {
+ // All data types must be all QS8 or all QS16
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale != 1,
+ "Unsupported scaling factor for QS8/QS16. Scale must be 1.");
+ }
+
+ // Validate in case of configured output
+ if (output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8,
+ DataType::QS16, DataType::S16,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ output->data_type() == DataType::U8 &&
+ (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
+ "Output can only be U8 if both inputs are U8");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
+ "Wrong shape for output");
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, output);
+ if (is_data_type_fixed_point(input1->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
+ }
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2,
+ ITensorInfo *output)
+{
+ const std::pair<TensorShape, ValidRegion> broadcast_pair =
+ ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
+ const TensorShape &out_shape = broadcast_pair.first;
+ const ValidRegion &valid_region = broadcast_pair.second;
+
+ // Auto initialize output if not initialized
+ {
+ set_shape_if_empty(*output, out_shape);
+
+ if (input1->data_type() == DataType::S16 || input2->data_type() == DataType::S16)
+ {
+ set_format_if_unknown(*output, Format::S16);
+ }
+ else if (input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32)
+ {
+ set_format_if_unknown(*output, Format::F32);
+ }
+ }
+
+ Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+ Window win_input1 = win.broadcast_if_dimension_le_one(*input1);
+ Window win_input2 = win.broadcast_if_dimension_le_one(*input2);
+
+ AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win_input1, input1_access) ||
+ update_window_and_padding(win_input2, input2_access) ||
+ update_window_and_padding(win, output_access);
+
+ output_access.set_valid_region(win, valid_region);
+
+ Status err = (window_changed)
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLPixelWiseDivisionKernel::CLPixelWiseDivisionKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void CLPixelWiseDivisionKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
+ ICLTensor *output, float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(),
+ scale, overflow_policy, rounding_policy));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ int scale_int = -1;
+ // Extract sign, exponent and mantissa
+ int exponent = 0;
+ float normalized_mantissa = std::frexp(scale, &exponent);
+  // Use int scaling if the factor is equal to 1/2^n for 0 <= n <= 15
+  // frexp returns 0.5 as the mantissa, which means that the exponent will be in the range
+  // -14 <= e <= 1 (e = 1 - n)
+  // Moreover, it will be non-positive for every n >= 1 as we deal with 1/2^n
+ if ((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1))
+ {
+ // Store the positive exponent. We know that we compute 1/2^n
+ // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5
+ scale_int = std::abs(exponent - 1);
+ }
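+  // For example, scale = 0.25f gives mantissa 0.5 and exponent -1, so scale_int = 2 and the
+  // "_int" kernel variant computes (in1 / in2) >> 2.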
+
+ std::string data_type;
+ std::string compute_type;
+ // Check if it has float inputs and output
+ if (is_data_type_float(input1->info()->data_type()) ||
+ is_data_type_float(input2->info()->data_type()))
+ {
+ scale_int = -1;
+ compute_type = (input1->info()->data_type() == DataType::F32 ||
+ input2->info()->data_type() == DataType::F32)
+ ? "float"
+ : "half";
+ data_type = "DATA_TYPE_FLOAT";
+ }
+ else
+ {
+ if (input1->info()->data_type() == DataType::S16 ||
+ input2->info()->data_type() == DataType::S16)
+ {
+ compute_type = "int";
+ }
+ else if (input1->info()->data_type() == DataType::QS8)
+ {
+ compute_type = "qs8";
+ }
+ else if (input1->info()->data_type() == DataType::QS16)
+ {
+ compute_type = "qs16";
+ }
+ else
+ {
+ compute_type = "ushort";
+ }
+ data_type = "DATA_TYPE_INT";
+ }
+
+ // Construct kernel name
+ std::string kernel_name = "pixelwise_div";
+ kernel_name += (scale_int >= 0) ? "_int" : "_float";
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace(
+ (overflow_policy == ConvertPolicy::WRAP || is_data_type_float(output->info()->data_type()))
+ ? "-DWRAP"
+ : "-DSATURATE");
+ build_opts.emplace((rounding_policy == RoundingPolicy::TO_ZERO) ? "-DROUND=_rtz"
+ : "-DROUND=_rte");
+ if (is_data_type_fixed_point(input1->info()->data_type()))
+ {
+ build_opts.emplace("-DFIXED_POINT_POSITION=" +
+ support::cpp11::to_string(input1->info()->fixed_point_position()));
+ }
+ build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_RES=" + compute_type);
+ build_opts.emplace("-D" + data_type);
+
+ // Create kernel
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+ // Set scale argument
+ unsigned int idx = 3 * num_arguments_per_3D_tensor(); // Skip the inputs and output parameters
+
+ if (scale_int >= 0)
+ {
+ _kernel.setArg(idx++, scale_int);
+ }
+ else
+ {
+ _kernel.setArg(idx++, scale);
+ }
+
+ ICLKernel::configure(win_config.second);
+}
+
+Status CLPixelWiseDivisionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output, float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(input1, input2, output, scale, overflow_policy, rounding_policy));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(),
+ input2->clone().get(),
+ output->clone().get())
+ .first);
+
+ return Status{};
+}
+
+void CLPixelWiseDivisionKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const TensorShape &in_shape1 = _input1->info()->tensor_shape();
+ const TensorShape &in_shape2 = _input2->info()->tensor_shape();
+ const TensorShape &out_shape = _output->info()->tensor_shape();
+
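+ // Dimensions from Z upwards are only collapsed when the inputs do not broadcast there (or when
+ // one input is a single element), e.g. (8, 4, 2, 3) / (8, 4, 2, 3) can collapse into one 3D
+ // run, whereas (8, 4, 2, 3) / (8, 4, 1, 3) must be processed slice by slice.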
+ bool can_collapse = true;
+ if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+ {
+ can_collapse =
+ (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+ for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
+ {
+ can_collapse = (in_shape1[d] == in_shape2[d]);
+ }
+ }
+
+ bool has_collapsed = false;
+ Window collapsed =
+ can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
+ : window;
+
+ const TensorShape &in_shape1_collapsed =
+ has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+ const TensorShape &in_shape2_collapsed =
+ has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+ Window slice = collapsed.first_slice_window_3D();
+ Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+ Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input1, slice_input1);
+ add_3D_tensor_argument(idx, _input2, slice_input2);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+
+ collapsed.slide_window_slice_3D(slice_input1);
+ collapsed.slide_window_slice_3D(slice_input2);
+ } while (collapsed.slide_window_slice_3D(slice));
+}
+
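+// The replicate border sits on the right of the X-broadcast input: e.g. for a 16-wide output and
+// a 1-wide broadcast input, replicateSize = 16 - 1 = 15 and the border is
+// min(num_elems_processed_per_iteration - 1, 15) = 15 elements.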
+BorderSize CLPixelWiseDivisionKernel::border_size() const
+{
+ const unsigned int replicateSize =
+ _output->info()->dimension(0) -
+ std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+ const unsigned int border =
+ std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
+ return BorderSize(0, border, 0, 0);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp
new file mode 100644
index 000000000..168b246bf
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLReduceMaxKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "support/ToolchainSupport.h"
+
+#include <cmath>
+#include <cstdlib>
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+Status validate_arguments(const ITensorInfo *input, int32_t axis, const ITensorInfo *output)
+{
+ // Only the simple case is handled:
+ // Input rank: 2
+ // Output rank: 1
+ // Axis: one axis value, restrict to 1
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis != 1, "Only axis 1 is supported");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
+ "Output tensor shape must not be empty");
+
+ // Validate in case of configured output
+ if (output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != input->data_type(),
+ "Input and output must have the same data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().num_dimensions() != 1,
+ "Output must be 1-dimensional");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->tensor_shape().num_dimensions() != 2,
+ "Input must be 2-dimensional");
+ }
+
+ return Status{};
+}
+
+} // namespace
+
+CLReduceMaxKernel::CLReduceMaxKernel() : _input(nullptr), _output(nullptr), _axis(0) {}
+
+void CLReduceMaxKernel::configure(const ICLTensor *input, int32_t axis, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, output->info()));
+
+ _input = input;
+ _output = output;
+ _axis = axis;
+
+ // Configure kernel window
+ int cols = _input->info()->tensor_shape()[0];
+ int rows = _input->info()->tensor_shape()[1];
+ Window win;
+ win.set(0, Window::Dimension(0, cols, 1));
+ win.set(1, Window::Dimension(0, rows, 1));
+
+ // Construct kernel name
+ std::string kernel_name = "reduce_max";
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-DWIDTH=" + support::cpp11::to_string(cols));
+
+ // Create kernel
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+ ICLKernel::configure(win);
+}
+
+Status CLReduceMaxKernel::validate(const ITensorInfo *input, int32_t axis,
+ const ITensorInfo *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, output));
+
+ return Status{};
+}
+
+void CLReduceMaxKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window window_input = window;
+ Window slice_input = window_input.first_slice_window_1D();
+
+ do
+ {
+ Window slice_output = slice_input.shift_dimensions(1);
+ unsigned int idx = 0;
+ add_1D_tensor_argument(idx, _input, slice_input);
+ add_1D_tensor_argument(idx, _output, slice_output);
+ enqueue(queue, *this, slice_input);
+
+ } while (window_input.slide_window_slice_1D(slice_input));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReductionMeanKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReductionMeanKernel.cpp
new file mode 100644
index 000000000..84a77122d
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLReductionMeanKernel.cpp
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLReductionMeanKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/FixedPoint.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+ std::vector<uint32_t> axis)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis.size() >= TensorShape::num_max_dimensions,
+ "Reduction axis greater than max number of dimensions");
+
+ std::vector<uint32_t>::const_iterator it;
+ bool axis_w = false;
+ bool axis_h = false;
+ for (it = axis.begin(); it != axis.end(); ++it)
+ {
+ if ((*it) == 0)
+ {
+ axis_w = true;
+ }
+ else if ((*it) == 1)
+ {
+ axis_h = true;
+ }
+ else
+ {
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported axis!");
+ }
+ }
+ // TODO Support other axes (currently, only reducing over both width and height is supported.)
+ if (!axis_w || !axis_h)
+ {
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported axis!");
+ }
+
+ if (output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != DataLayout::NCHW);
+ }
+
+ return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output,
+ std::vector<uint32_t> axis)
+{
+ // Output tensor auto initialization if not yet initialized
+ TensorShape output_shape{input->tensor_shape()};
+ output_shape.set(0, 1);
+ output_shape.set(1, 1);
+ auto_init_if_empty(*output, output_shape, output->num_channels(), input->data_type(),
+ input->fixed_point_position());
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration_x = 8; // step
+ const unsigned int num_elems_processed_per_iteration_y = input->dimension(1);
+
+ Window win = calculate_max_window(
+ *input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x,
+ num_elems_processed_per_iteration_y);
+ AccessWindowHorizontal output_access(output, 0, 1);
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, output->valid_region());
+
+ Status err = (window_changed)
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
+
+ return std::make_tuple(err, win);
+}
+} // namespace
+
+CLReductionMeanKernel::CLReductionMeanKernel()
+ : _input(nullptr), _output(nullptr), _reduction_axis(), _border_size()
+{
+}
+
+BorderSize CLReductionMeanKernel::border_size() const { return _border_size; }
+
+void CLReductionMeanKernel::configure(const ICLTensor *input, ICLTensor *output,
+ std::vector<uint32_t> axis)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis));
+
+ _input = input;
+ _output = output;
+ _reduction_axis = axis;
+
+ constexpr unsigned int num_elems_processed_per_iteration_x = 8; // step
+
+ // Set border size
+ _border_size = BorderSize(
+ ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration_x) -
+ input->info()->dimension(0));
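+ // e.g. an input width of 10 with a step of 8 gives ceil_to_multiple(10, 8) - 10 = 6 border
+ // elements.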
+
+ // Set build options
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ // build_opts.emplace(("-DVEC_SIZE=" +
+ // support::cpp11::to_string(num_elems_processed_per_iteration)));
+ if (is_data_type_fixed_point(input->info()->data_type()))
+ {
+ build_opts.emplace("-DFIXED_POINT_POSITION=" +
+ support::cpp11::to_string(input->info()->fixed_point_position()));
+ }
+
+ // Create kernel
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("reduction_mean", build_opts));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis);
+
+ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+
+ ICLKernel::configure(std::get<1>(win_config));
+}
+
+Status CLReductionMeanKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+ std::vector<uint32_t> axis)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis));
+ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(
+ validate_and_configure_window(input->clone().get(), output->clone().get(), axis)));
+
+ return Status{};
+}
+
+void CLReductionMeanKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ // Set out window
+ Window out_window(window);
+ out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ // Get first input and output slices
+ Window in_slice = window.first_slice_window_2D();
+ Window out_slice = out_window.first_slice_window_2D();
+
+ // Set local sums buffer
+ // TODO work_group
+ unsigned int local_sum_size = _lws_hint[0] * _input->info()->element_size();
+
+ unsigned int idx = 2 * num_arguments_per_2D_tensor();
+ _kernel.setArg(idx++, local_sum_size, nullptr);
+ _kernel.setArg<cl_int>(idx++, static_cast<cl_int>(_input->info()->dimension(1))); // height
+ _kernel.setArg<cl_int>(idx++, static_cast<cl_int>(_input->info()->dimension(0) *
+ _input->info()->dimension(1))); // divider
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, in_slice);
+ in_slice.set_dimension_step(Window::DimY, _input->info()->dimension(1));
+ add_2D_tensor_argument(idx, _output, out_slice);
+ enqueue(queue, *this, in_slice);
+ } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceKernel.cpp
new file mode 100644
index 000000000..80ffd423a
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceKernel.cpp
@@ -0,0 +1,304 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "support/ToolchainSupport.h"
+
+#include <string>
+
+using namespace arm_compute;
+
+static const int32_t maxDim = 4;
+
+CLStridedSliceKernel::CLStridedSliceKernel()
+ : _input(nullptr), _output(nullptr), _beginData(nullptr), _endData(nullptr),
+ _stridesData(nullptr), _beginMask(0), _endMask(0), _shrinkAxisMask(0)
+{
+}
+
+Status CLStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *begin, const ITensorInfo *end,
+ const ITensorInfo *strides, int32_t beginMask,
+ int32_t endMask, int32_t shrinkAxisMask)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, begin, end, strides);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8, DataType::U16,
+ DataType::S16, DataType::QS16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(begin, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(end, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(strides, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ ARM_COMPUTE_ERROR_ON(begin->num_dimensions() != 1 || begin->dimension(0) > 4);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(begin->tensor_shape(), end->tensor_shape(),
+ strides->tensor_shape());
+
+ return Status{};
+}
+
+// Return the index for the first element along that axis. This index will be a
+// positive integer between [0, axisSize - 1] that can be used to index
+// directly into the data.
+inline int32_t StartForAxis(int32_t beginMask, int32_t begin, int32_t stride,
+ const TensorShape &inputShape, int32_t axis)
+{
+ // Begin with the specified index
+ int32_t start = begin;
+
+ // beginMask override
+ if (beginMask & 1 << axis)
+ {
+ if (stride > 0)
+ {
+ // Forward iteration - use the first element. These values will get
+ // clamped below (Note: We could have set them to 0 and axisSize-1, but
+ // use lowest() and max() to maintain symmetry with StopForAxis())
+ start = std::numeric_limits<int32_t>::lowest();
+ }
+ else
+ {
+ // Backward iteration - use the last element.
+ start = std::numeric_limits<int32_t>::max();
+ }
+ }
+
+ // Handle negative indices
+ int32_t axisSize = inputShape[axis];
+ if (start < 0)
+ {
+ start += axisSize;
+ }
+
+ // Clamping
+ start = arm_compute::utility::clamp(start, 0, axisSize - 1);
+
+ return start;
+}
+
+// Return the "real" index for the end of iteration along that axis. This is an
+// "end" in the traditional C sense, in that it points to one past the last
+// element, i.e. if you were iterating through all elements of a 1D array of
+// size 4, this function would return 4 as the stop, because it is one past the
+// "real" indices of 0, 1, 2 & 3.
+inline int32_t StopForAxis(int32_t endMask, int32_t end, int32_t stride,
+ const TensorShape &inputShape, int32_t axis)
+{
+ // Begin with the specified index
+ int32_t stop = end;
+
+ // endMask override
+ if (endMask & (1 << axis))
+ {
+ if (stride > 0)
+ {
+ // Forward iteration - use the last element. These values will get
+ // clamped below
+ stop = std::numeric_limits<int32_t>::max();
+ }
+ else
+ {
+ // Backward iteration - use the first element.
+ stop = std::numeric_limits<int32_t>::lowest();
+ }
+ }
+
+ // Handle negative indices
+ int32_t axisSize = inputShape[axis];
+ if (stop < 0)
+ {
+ stop += axisSize;
+ }
+
+ // Clamping
+ // Because the end index points one past the last element, we need slightly
+ // different clamping ranges depending on the direction.
+ if (stride > 0)
+ {
+ // Forward iteration
+ stop = arm_compute::utility::clamp(stop, 0, axisSize);
+ }
+ else
+ {
+ // Backward iteration
+ stop = arm_compute::utility::clamp(stop, -1, axisSize - 1);
+ }
+
+ return stop;
+}
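+// Worked example of the two helpers above: with axisSize = 4, stride = 1, begin = -1 and end = 4
+// and no mask bits set, StartForAxis returns -1 + 4 = 3 and StopForAxis returns 4, so only
+// index 3 is visited; setting the corresponding beginMask bit instead clamps the start to 0.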
+
+inline int32_t offset4D(const TensorShape &shape, int32_t b, int32_t d, int32_t h, int32_t w)
+{
+ int32_t offset = b * shape[2] * shape[1] * shape[0];
+ offset += d * shape[1] * shape[0];
+ offset += h * shape[0];
+ offset += w;
+ return offset;
+}
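+// offset4D flattens a (b, d, h, w) coordinate with w innermost: for shape [W, H, D, B] =
+// [4, 3, 2, 1], offset4D(shape, 0, 1, 2, 3) = 0*2*3*4 + 1*3*4 + 2*4 + 3 = 23.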
+
+inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride)
+{
+ int32_t ret = 0;
+ if (stride > 0)
+ {
+ ret = ((stop - start - 1) / stride) + 1;
+ }
+ else
+ {
+ ret = ((stop - start + 1) / stride) + 1;
+ }
+ ARM_COMPUTE_ERROR_ON_MSG(ret < 0, "The output dimension must be non-negative");
+ return ret;
+}
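+// For example, start = 0, stop = 4, stride = 2 gives ((4 - 0 - 1) / 2) + 1 = 2 output elements
+// (input indices 0 and 2), while start = 3, stop = -1, stride = -1 gives ((-1 - 3 + 1) / -1) + 1 = 4.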
+
+void CLStridedSliceKernel::configure(const ICLTensor *input, ICLTensor *output,
+ ICLTensor *beginData, ICLTensor *endData,
+ ICLTensor *stridesData, int32_t beginMask, int32_t endMask,
+ int32_t shrinkAxisMask)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), beginData->info(),
+ endData->info(), stridesData->info(), beginMask, endMask,
+ shrinkAxisMask));
+
+ _input = input;
+ _output = output;
+ _beginData = beginData;
+ _endData = endData;
+ _stridesData = stridesData;
+ _beginMask = beginMask;
+ _endMask = endMask;
+ _shrinkAxisMask = shrinkAxisMask;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-DELEMENT_DATA_TYPE=" +
+ get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DELEMENT_SIZE=" + support::cpp11::to_string(input->info()->element_size()));
+
+ // Create kernel
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("strided_slice", build_opts));
+
+ // Create output's window without padding
+ TensorShape collapsed = output->info()->tensor_shape();
+ collapsed.collapse(4);
+ TensorInfo info = *output->info();
+ info.set_tensor_shape(collapsed);
+ Window win = calculate_max_window(info, Steps(num_elems_processed_per_iteration));
+
+ ICLKernel::configure(win);
+}
+
+void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ // Create input window
+ TensorShape collapsed = _input->info()->tensor_shape();
+ collapsed.collapse(4);
+ TensorInfo info = *_input->info();
+ info.set_tensor_shape(collapsed);
+ Window win_in = calculate_max_window(info, Steps(_input->info()->tensor_shape().total_size()));
+
+ _beginData->map(queue);
+ _endData->map(queue);
+ _stridesData->map(queue);
+
+ std::vector<int32_t> dimsIn;
+ std::vector<int32_t> dimsOut;
+ std::vector<int32_t> starts;
+ std::vector<int32_t> stops;
+ std::vector<int32_t> strides;
+
+ for (uint32_t n = 0; n < _beginData->info()->tensor_shape().total_size(); ++n)
+ {
+ const TensorShape shape = _input->info()->tensor_shape();
+ starts.emplace_back(
+ StartForAxis(_beginMask, reinterpret_cast<int32_t *>(_beginData->buffer())[n],
+ reinterpret_cast<int32_t *>(_stridesData->buffer())[n], shape, n));
+
+ stops.emplace_back(StopForAxis(_endMask, reinterpret_cast<int32_t *>(_endData->buffer())[n],
+ reinterpret_cast<int32_t *>(_stridesData->buffer())[n], shape,
+ n));
+
+ strides.emplace_back(reinterpret_cast<int32_t *>(_stridesData->buffer())[n]);
+ dimsIn.emplace_back(shape[n]);
+ dimsOut.emplace_back(getOutDim(starts[n], stops[n], strides[n]));
+ }
+
+ for (uint32_t n = _beginData->info()->tensor_shape().total_size(); n < 4; n++)
+ {
+ starts.emplace_back(0);
+ stops.emplace_back(1);
+ strides.emplace_back(1);
+ dimsIn.emplace_back(1);
+ dimsOut.emplace_back(1);
+ }
+ // TODO: Apply shrinkAxisMask
+
+ _beginData->unmap(queue);
+ _stridesData->unmap(queue);
+ _endData->unmap(queue);
+
+ // Set parameters
+ unsigned int idx = 2 * num_arguments_per_1D_tensor(); // Skip the input and output parameters
+ const cl_int4 dimsInArg = {{
+ static_cast<cl_int>(dimsIn[0]), static_cast<cl_int>(dimsIn[1]),
+ static_cast<cl_int>(dimsIn[2]), static_cast<cl_int>(dimsIn[3]),
+ }};
+ _kernel.setArg<cl_int4>(idx++, dimsInArg);
+
+ const cl_int4 dimsOutArg = {{
+ static_cast<cl_int>(dimsOut[0]), static_cast<cl_int>(dimsOut[1]),
+ static_cast<cl_int>(dimsOut[2]), static_cast<cl_int>(dimsOut[3]),
+ }};
+ _kernel.setArg<cl_int4>(idx++, dimsOutArg);
+
+ const cl_int4 startsArg = {{
+ static_cast<cl_int>(starts[0]), static_cast<cl_int>(starts[1]),
+ static_cast<cl_int>(starts[2]), static_cast<cl_int>(starts[3]),
+ }};
+ _kernel.setArg<cl_int4>(idx++, startsArg);
+
+ const cl_int4 stridesArg = {{
+ static_cast<cl_int>(strides[0]), static_cast<cl_int>(strides[1]),
+ static_cast<cl_int>(strides[2]), static_cast<cl_int>(strides[3]),
+ }};
+ _kernel.setArg<cl_int4>(idx++, stridesArg);
+
+ // TODO: Apply slicing output's window
+ idx = 0;
+ add_1D_tensor_argument(idx, _input, win_in);
+ add_1D_tensor_argument(idx, _output, window);
+
+ enqueue(queue, *this, window);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp
new file mode 100644
index 000000000..d95b485b7
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp
@@ -0,0 +1,475 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "support/ToolchainSupport.h"
+
+#include <climits>
+#include <cassert>
+
+namespace arm_compute
+{
+////////////////////////////////////////////////////////////////////////////////
+CLTopKV2Single::CLTopKV2Single() : _input(nullptr), _topk_values(nullptr), _topk_indices(nullptr) {}
+
+void CLTopKV2Single::configure(ICLTensor *input, ICLTensor *topk_values, ICLTensor *topk_indices,
+ cl::Buffer *indices, cl::Buffer *temp_stack, int k, int n)
+{
+ ARM_COMPUTE_ERROR_ON(input == nullptr || indices == nullptr);
+ ARM_COMPUTE_ERROR_ON(topk_values == nullptr || topk_indices == nullptr);
+ ARM_COMPUTE_ERROR_ON(n == 0);
+
+ _input = input;
+ _topk_values = topk_values;
+ _topk_indices = topk_indices;
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("topkv2_quicksort", build_opts));
+
+ unsigned int idx = 3 * num_arguments_per_1D_tensor();
+ _kernel.setArg(idx++, *indices);
+ _kernel.setArg(idx++, *temp_stack);
+ _kernel.setArg<cl_int>(idx++, k);
+ _kernel.setArg<cl_int>(idx++, n);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, 1, 1));
+ ICLKernel::configure(win);
+}
+
+void CLTopKV2Single::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ unsigned int idx = 0;
+ add_1D_tensor_argument(idx, _input, window);
+ add_1D_tensor_argument(idx, _topk_values, window);
+ add_1D_tensor_argument(idx, _topk_indices, window);
+
+ enqueue(queue, *this, window);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLTopKV2Init::CLTopKV2Init() : _input(nullptr) {}
+
+void CLTopKV2Init::configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffer *in_ind_buf,
+ int n)
+{
+ ARM_COMPUTE_ERROR_ON(input == nullptr || in_key_buf == nullptr);
+ ARM_COMPUTE_ERROR_ON(in_ind_buf == nullptr);
+ ARM_COMPUTE_ERROR_ON(n == 0);
+
+ _input = input;
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+
+ // Create kernel
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("topkv2_init", build_opts));
+
+ unsigned int idx = num_arguments_per_1D_tensor();
+ _kernel.setArg(idx++, *in_key_buf);
+ _kernel.setArg(idx++, *in_ind_buf);
+ _kernel.setArg<cl_int>(idx++, n);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, n, 1));
+ ICLKernel::configure(win);
+}
+
+void CLTopKV2Init::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ unsigned int idx = 0;
+ add_1D_tensor_argument(idx, _input, window);
+
+ enqueue(queue, *this, window);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// This kernel makes a histogram of radix for each work item.
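+// Together with the scan, paste and reorder kernels below, this appears to form one pass of a
+// least-significant-digit radix sort over _BITS-wide digits; the caller is expected to run the
+// sequence once per pass, updating _pass and swapping the in/out key and index buffers in between.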
+CLRadixSortHistogram::CLRadixSortHistogram() : _pass(0), _in_key_buf(nullptr) {}
+
+void CLRadixSortHistogram::configure(cl::Buffer *hist_buf, int bits, int n)
+{
+ ARM_COMPUTE_ERROR_ON(hist_buf == nullptr);
+
+ unsigned int radix = 1 << bits;
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
+ build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
+ build_opts.emplace("-DPERMUT=1");
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("radixsort_histogram", build_opts));
+
+ int loc_histo_size = radix * _ITEMS * sizeof(cl_int);
+
+ unsigned int idx = 1;
+ _kernel.setArg(idx++, *hist_buf);
+
+ idx = 3;
+ _kernel.setArg(idx++, loc_histo_size, nullptr);
+ _kernel.setArg<cl_int>(idx++, n);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1));
+ ICLKernel::configure(win);
+}
+
+void CLRadixSortHistogram::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ _kernel.setArg(0, *_in_key_buf);
+ _kernel.setArg<cl_int>(2, _pass);
+
+ cl::NDRange lws = cl::NDRange(_ITEMS, 1);
+
+ enqueue(queue, *this, window, lws);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLRadixSortScanHistogram::CLRadixSortScanHistogram() {}
+
+void CLRadixSortScanHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits)
+{
+ ARM_COMPUTE_ERROR_ON(hist_buf == nullptr || glob_sum_buf == nullptr);
+
+ unsigned int radix = 1 << bits;
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
+ build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
+ build_opts.emplace("-DPERMUT=1");
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("radixsort_scanhistograms", build_opts));
+
+ int temp_size =
+ std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint);
+
+ unsigned int idx = 0;
+ _kernel.setArg(idx++, *hist_buf);
+ _kernel.setArg(idx++, temp_size, nullptr);
+ _kernel.setArg(idx++, *glob_sum_buf);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1));
+ ICLKernel::configure(win);
+}
+
+void CLRadixSortScanHistogram::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
+ cl::NDRange lws = cl::NDRange(gws_x / _HISTOSPLIT, 1);
+
+ enqueue(queue, *this, window, lws);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLRadixSortGlobalScanHistogram::CLRadixSortGlobalScanHistogram() {}
+
+void CLRadixSortGlobalScanHistogram::configure(cl::Buffer *glob_sum_buf, cl::Buffer *temp_buf,
+ int bits)
+{
+ ARM_COMPUTE_ERROR_ON(glob_sum_buf == nullptr || temp_buf == nullptr);
+
+ unsigned int radix = 1 << bits;
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
+ build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
+ build_opts.emplace("-DPERMUT=1");
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("radixsort_scanhistograms", build_opts));
+
+ int temp_size =
+ std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint);
+
+ unsigned int idx = 0;
+ _kernel.setArg(idx++, *glob_sum_buf);
+ _kernel.setArg(idx++, temp_size, nullptr);
+ _kernel.setArg(idx++, *temp_buf);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, _HISTOSPLIT / 2, 1));
+ ICLKernel::configure(win);
+}
+
+void CLRadixSortGlobalScanHistogram::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
+ cl::NDRange lws = cl::NDRange(gws_x, 1);
+
+ enqueue(queue, *this, window, lws);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLRadixSortPasteHistogram::CLRadixSortPasteHistogram() {}
+
+void CLRadixSortPasteHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits)
+{
+ ARM_COMPUTE_ERROR_ON(hist_buf == nullptr || glob_sum_buf == nullptr);
+
+ unsigned int radix = 1 << bits;
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
+ build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
+ build_opts.emplace("-DPERMUT=1");
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("radixsort_pastehistograms", build_opts));
+
+ unsigned int idx = 0;
+ _kernel.setArg(idx++, *hist_buf);
+ _kernel.setArg(idx++, *glob_sum_buf);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1));
+ ICLKernel::configure(win);
+}
+
+void CLRadixSortPasteHistogram::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
+ cl::NDRange lws = cl::NDRange(gws_x / _HISTOSPLIT, 1);
+
+ enqueue(queue, *this, window, lws);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLRadixSortReorder::CLRadixSortReorder()
+ : _pass(0), _in_key_buf(nullptr), _out_key_buf(nullptr), _in_ind_buf(nullptr),
+ _out_ind_buf(nullptr)
+{
+}
+
+void CLRadixSortReorder::configure(cl::Buffer *hist_buf, int bits, int n)
+{
+ ARM_COMPUTE_ERROR_ON(hist_buf == nullptr);
+ ARM_COMPUTE_ERROR_ON(n == 0);
+
+ unsigned int radix = 1 << bits;
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
+ build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
+ build_opts.emplace("-DPERMUT=1");
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("radixsort_reorder", build_opts));
+
+ unsigned int idx = 2;
+ _kernel.setArg(idx++, *hist_buf);
+
+ idx = 6;
+ _kernel.setArg(idx++, sizeof(uint) * radix * _ITEMS, nullptr);
+ _kernel.setArg<cl_int>(idx++, n);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1));
+ ICLKernel::configure(win);
+}
+
+void CLRadixSortReorder::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
+ unsigned int lx = std::max(1U, (gws_x / _HISTOSPLIT));
+ cl::NDRange lws = (lx < gws_x) ? cl::NDRange(lx, 1) : cl::NDRange(1, 1);
+
+ _kernel.setArg(0, *_in_key_buf);
+ _kernel.setArg(1, *_out_key_buf);
+ _kernel.setArg<cl_int>(3, _pass);
+ _kernel.setArg(4, *_in_ind_buf);
+ _kernel.setArg(5, *_out_ind_buf);
+
+ enqueue(queue, *this, window, lws);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLTopKV2FindFirstNegative::CLTopKV2FindFirstNegative() : _out_key_buf(nullptr) {}
+
+void CLTopKV2FindFirstNegative::configure(cl::Buffer *first_negative_idx_buf, int n)
+{
+ ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr);
+ ARM_COMPUTE_ERROR_ON(n == 0);
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("topkv2_find_first_negative", build_opts));
+
+ unsigned int idx = 1;
+ _kernel.setArg(idx++, *first_negative_idx_buf);
+ _kernel.setArg<cl_int>(idx++, n);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, n, 1));
+ ICLKernel::configure(win);
+}
+
+void CLTopKV2FindFirstNegative::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ unsigned int idx = 0;
+ _kernel.setArg(idx++, *_out_key_buf);
+
+ enqueue(queue, *this, window);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLTopKV2ReorderNegatives::CLTopKV2ReorderNegatives()
+ : _in_key_buf(nullptr), _out_key_buf(nullptr), _in_ind_buf(nullptr), _out_ind_buf(nullptr)
+{
+}
+
+void CLTopKV2ReorderNegatives::configure(cl::Buffer *first_negative_idx_buf, int n)
+{
+ ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr);
+ ARM_COMPUTE_ERROR_ON(n == 0);
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("topkv2_reorder_negatives", build_opts));
+
+ unsigned int idx = 4;
+ _kernel.setArg(idx++, *first_negative_idx_buf);
+ _kernel.setArg<cl_int>(idx++, n);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, n, 1));
+ ICLKernel::configure(win);
+}
+
+void CLTopKV2ReorderNegatives::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ unsigned int idx = 0;
+ _kernel.setArg(idx++, *_in_key_buf);
+ _kernel.setArg(idx++, *_out_key_buf);
+ _kernel.setArg(idx++, *_in_ind_buf);
+ _kernel.setArg(idx++, *_out_ind_buf);
+
+ enqueue(queue, *this, window);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLTopKV2Store::CLTopKV2Store()
+ : _values(nullptr), _indices(nullptr), _out_key_buf(nullptr), _out_ind_buf(nullptr)
+{
+}
+
+void CLTopKV2Store::configure(ICLTensor *values, ICLTensor *indices, int k, int n)
+{
+ ARM_COMPUTE_ERROR_ON(values == nullptr || indices == nullptr);
+ ARM_COMPUTE_ERROR_ON(k == 0);
+ ARM_COMPUTE_ERROR_ON(k > n);
+
+ _values = values;
+ _indices = indices;
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+
+ // Create kernel
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("topkv2_store", build_opts));
+
+ unsigned int idx = 2 * num_arguments_per_1D_tensor() + 2;
+ _kernel.setArg<cl_int>(idx++, n);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, k, 1));
+ ICLKernel::configure(win);
+}
+
+void CLTopKV2Store::setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf)
+{
+ _out_key_buf = out_key_buf;
+ _out_ind_buf = out_ind_buf;
+}
+
+void CLTopKV2Store::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ unsigned int idx = 0;
+ add_1D_tensor_argument(idx, _values, window);
+ add_1D_tensor_argument(idx, _indices, window);
+ _kernel.setArg(idx++, *_out_key_buf);
+ _kernel.setArg(idx++, *_out_ind_buf);
+
+ enqueue(queue, *this, window);
+}
+
+} // namespace arm_compute
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp
new file mode 100644
index 000000000..e1059ab53
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLCast.h"
+
+#include "arm_compute/core/CL/kernels/CLCastKernel.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLCast::configure(ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLCastKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp
new file mode 100644
index 000000000..5552cbc6f
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLGather.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLGatherKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLGather::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLGatherKernel>();
+ k->configure(input1, input2, output);
+ _kernel = std::move(k);
+}
+
+Status CLGather::validate(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output)
+{
+ return CLGatherKernel::validate(input1, input2, output);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp
new file mode 100644
index 000000000..e1add5e90
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLPixelWiseDivision.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLPixelWiseDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output,
+ float scale, ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLPixelWiseDivisionKernel>();
+ k->configure(input1, input2, output, scale, overflow_policy, rounding_policy);
+ _kernel = std::move(k);
+
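+ // If one input is broadcast along X, give it a replicate border so the kernel can read a full
+ // iteration of elements past its single column.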
+ if (output->info()->dimension(0) > 1)
+ {
+ ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+ if (broadcasted_info->info()->dimension(0) == 1)
+ {
+ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ }
+ }
+}
+
+Status CLPixelWiseDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output, float scale,
+ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
+{
+ return CLPixelWiseDivisionKernel::validate(input1, input2, output, scale, overflow_policy,
+ rounding_policy);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp
new file mode 100644
index 000000000..3382058db
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLReduceMax.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "support/ToolchainSupport.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/kernels/CLReduceMaxKernel.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <vector>
+#include <algorithm>
+
+#include <utility>
+
+#define REDUCE_MAX_RUN_ON_CPU 1
+
+namespace arm_compute
+{
+
+CLReduceMax::CLReduceMax() : _axis(0), _input(nullptr), _output(nullptr), _kernel(nullptr) {}
+
+void CLReduceMax::configure(ICLTensor *input, int axis, ICLTensor *output)
+{
+ _axis = axis;
+
+ _input = input;
+ _output = output;
+
+ auto k = arm_compute::support::cpp14::make_unique<CLReduceMaxKernel>();
+ k->configure(input, axis, output);
+ _kernel = std::move(k);
+
+ // Only the simple case is handled:
+ // Output rank: 1
+ // Axis: one axis value, restrict to 1
+ ARM_COMPUTE_ERROR_ON(input->info()->tensor_shape().num_dimensions() != 2);
+ ARM_COMPUTE_ERROR_ON(output->info()->tensor_shape().num_dimensions() != 1);
+ ARM_COMPUTE_ERROR_ON(axis != 1);
+}
+
+Status CLReduceMax::validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output)
+{
+ return CLReduceMaxKernel::validate(input, axis, output);
+}
+
+void CLReduceMax::run()
+{
+#if REDUCE_MAX_RUN_ON_CPU
+ run_on_cpu();
+
+ arm_compute::CLScheduler::get().sync();
+#else
+ arm_compute::CLScheduler::get().enqueue(*_kernel);
+#endif
+}
+
+void CLReduceMax::run_on_cpu()
+{
+ cl::CommandQueue q = CLScheduler::get().queue();
+
+ _input->map(q);
+ _output->map(q);
+
+ // Compute on the CPU for the simple case:
+ // Input rank: 2
+ // Output rank: 1
+ // Axis: one axis value, restrict to 1
+
+ float *input_data = (float *)_input->buffer();
+ float *output_data = (float *)_output->buffer();
+
+ std::vector<float> container_max;
+ int cols = _input->info()->tensor_shape()[0];
+ int rows = _input->info()->tensor_shape()[1];
+ container_max.resize(rows);
+
+ // Initialize as 1st element in row
+ float *input_pointer = input_data;
+ for (int i = 0; i < rows; i++)
+ {
+ container_max[i] = *input_pointer;
+ input_pointer += cols;
+ }
+
+ // Update max value in row
+ for (int i = 0; i < rows; i++)
+ {
+ float max_in_row = container_max[i];
+ for (int j = 1; j < cols; j++)
+ {
+ if (max_in_row < input_data[i * cols + j])
+ {
+ max_in_row = input_data[i * cols + j];
+ }
+ }
+ container_max[i] = max_in_row;
+ }
+
+ for (int i = 0; i < rows; i++)
+ {
+ output_data[i] = container_max[i];
+ }
+
+ _input->unmap(q);
+ _output->unmap(q);
+}
+} // namespace arm_compute
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLReductionMean.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLReductionMean.cpp
new file mode 100644
index 000000000..ab724e752
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLReductionMean.cpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLReductionMean.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLReductionMeanKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLReductionMean::CLReductionMean() : _reduction_mean_kernel(), _fill_border_kernel() {}
+
+Status CLReductionMean::validate(const ITensorInfo *input, const ITensorInfo *output,
+ std::vector<uint32_t> axis)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionMeanKernel::validate(input, output, axis));
+ return Status{};
+}
+
+void CLReductionMean::configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis)
+{
+ _reduction_mean_kernel.configure(input, output, axis);
+ _fill_border_kernel.configure(input, _reduction_mean_kernel.border_size(), BorderMode::CONSTANT,
+ PixelValue(0));
+}
+
+void CLReductionMean::run()
+{
+ CLScheduler::get().enqueue(_fill_border_kernel);
+ CLScheduler::get().enqueue(_reduction_mean_kernel);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSlice.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSlice.cpp
new file mode 100644
index 000000000..cd576cec1
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSlice.cpp
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLStridedSlice.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h"
+#include "arm_compute/core/utils/misc/Utility.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+#include <vector>
+
+using namespace arm_compute;
+
+static const int32_t maxDims = 4;
+
+// Return the index for the first element along that axis. This index will be a
+// positive integer between [0, axisSize - 1] that can be used to index
+// directly into the data.
+inline int32_t StartForAxis(int32_t beginMask, std::vector<int32_t> const &startIndices,
+ std::vector<int32_t> const &strides, const TensorShape &inputShape,
+ int32_t axis)
+{
+ // Begin with the specified index
+ int32_t start = startIndices[axis];
+
+ // beginMask override
+ if (beginMask & 1 << axis)
+ {
+ if (strides[axis] > 0)
+ {
+ // Forward iteration - use the first element. These values will get
+ // clamped below (Note: We could have set them to 0 and axisSize-1, but
+ // use lowest() and max() to maintain symmetry with StopForAxis())
+ start = std::numeric_limits<int32_t>::lowest();
+ }
+ else
+ {
+ // Backward iteration - use the last element.
+ start = std::numeric_limits<int32_t>::max();
+ }
+ }
+
+ // Handle negative indices
+ int32_t axisSize = inputShape[axis];
+ if (start < 0)
+ {
+ start += axisSize;
+ }
+
+ // Clamping
+ start = arm_compute::utility::clamp(start, 0, axisSize - 1);
+
+ return start;
+}
+
+// Return the "real" index for the end of iteration along that axis. This is an
+// "end" in the traditional C sense, in that it points to one past the last
+// element, i.e. if you were iterating through all elements of a 1D array of
+// size 4, this function would return 4 as the stop, because it is one past the
+// "real" indices of 0, 1, 2 & 3.
+inline int32_t StopForAxis(int32_t endMask, std::vector<int32_t> const &stopIndices,
+ std::vector<int32_t> const &strides, const TensorShape &inputShape,
+ int32_t axis)
+{
+ // Begin with the specified index
+ int32_t stop = stopIndices[axis];
+
+ // endMask override
+ if (endMask & (1 << axis))
+ {
+ if (strides[axis] > 0)
+ {
+ // Forward iteration - use the last element. These values will get
+ // clamped below
+ stop = std::numeric_limits<int32_t>::max();
+ }
+ else
+ {
+ // Backward iteration - use the first element.
+ stop = std::numeric_limits<int32_t>::lowest();
+ }
+ }
+
+ // Handle negative indices
+ int32_t axisSize = inputShape[axis];
+ if (stop < 0)
+ {
+ stop += axisSize;
+ }
+
+ // Clamping
+ // Because the end index points one past the last element, we need slightly
+ // different clamping ranges depending on the direction.
+ if (strides[axis] > 0)
+ {
+ // Forward iteration
+ stop = arm_compute::utility::clamp(stop, 0, axisSize);
+ }
+ else
+ {
+ // Backward iteration
+ stop = arm_compute::utility::clamp(stop, -1, axisSize - 1);
+ }
+
+ return stop;
+}
+
+inline int32_t offset4D(const TensorShape &shape, int32_t b, int32_t d, int32_t h, int32_t w)
+{
+ int32_t offset = b * shape[2] * shape[1] * shape[0];
+ offset += d * shape[1] * shape[0];
+ offset += h * shape[0];
+ offset += w;
+ return offset;
+}
+
+void CLStridedSlice::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData,
+ ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask,
+ int32_t endMask, int32_t shrinkAxisMask)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceKernel>();
+ k->configure(input, output, beginData, endData, stridesData, beginMask, endMask, shrinkAxisMask);
+ _kernel = std::move(k);
+}
+
+void CLStridedSliceCPU::configure(ICLTensor *input, ICLTensor *output, ICLTensor *beginData,
+ ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask,
+ int32_t endMask, int32_t shrinkAxisMask)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(CLStridedSliceKernel::validate(
+ input->info(), output->info(), beginData->info(), endData->info(), stridesData->info(),
+ beginMask, endMask, shrinkAxisMask));
+
+ _input = input;
+ _output = output;
+ _beginData = beginData;
+ _endData = endData;
+ _stridesData = stridesData;
+ _beginMask = beginMask;
+ _endMask = endMask;
+ _shrinkAxisMask = shrinkAxisMask;
+}
+
+void CLStridedSliceCPU::run()
+{
+ run_on_cpu();
+
+ arm_compute::CLScheduler::get().sync();
+}
+
+inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride)
+{
+ if (stride > 0)
+ {
+ return ((stop - start - 1) / stride) + 1;
+ }
+ else
+ {
+ return ((stop - start + 1) / stride) + 1;
+ }
+}
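+
+// For example:
+//   getOutDim(0, 5, 2)   == 3  (elements at indices 0, 2, 4)
+//   getOutDim(4, -1, -2) == 3  (elements at indices 4, 2, 0)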
+
+template <typename T>
+inline void StridedSlice(const T *inputData, const TensorShape &inputShape, int32_t beginMask,
+ int32_t endMask, const std::vector<int32_t> &startIndices,
+ const std::vector<int32_t> &stopIndices,
+ const std::vector<int32_t> &strides, T *outputData)
+{
+ ARM_COMPUTE_ERROR_ON(startIndices.size() != maxDims);
+ ARM_COMPUTE_ERROR_ON(stopIndices.size() != maxDims);
+ ARM_COMPUTE_ERROR_ON(strides.size() != maxDims);
+
+ const int32_t start_b = StartForAxis(beginMask, startIndices, strides, inputShape, 3);
+ const int32_t stop_b = StopForAxis(endMask, stopIndices, strides, inputShape, 3);
+ const int32_t start_d = StartForAxis(beginMask, startIndices, strides, inputShape, 2);
+ const int32_t stop_d = StopForAxis(endMask, stopIndices, strides, inputShape, 2);
+ const int32_t start_h = StartForAxis(beginMask, startIndices, strides, inputShape, 1);
+ const int32_t stop_h = StopForAxis(endMask, stopIndices, strides, inputShape, 1);
+ const int32_t start_w = StartForAxis(beginMask, startIndices, strides, inputShape, 0);
+ const int32_t stop_w = StopForAxis(endMask, stopIndices, strides, inputShape, 0);
+
+  // The output shape may collapse along some dimensions, so build a shape that
+  // matches the actual extent of the sliced output.
+ TensorShape outputShape(
+ getOutDim(start_w, stop_w, strides[0]), getOutDim(start_h, stop_h, strides[1]),
+ getOutDim(start_d, stop_d, strides[2]), getOutDim(start_b, stop_b, strides[3]));
+ for (int32_t in_b = start_b, b = 0; strides[3] > 0 ? in_b < stop_b : in_b > stop_b;
+ in_b += strides[3], b++)
+ {
+ for (int32_t in_d = start_d, d = 0; strides[2] > 0 ? in_d < stop_d : in_d > stop_d;
+ in_d += strides[2], d++)
+ {
+ for (int32_t in_h = start_h, h = 0; strides[1] > 0 ? in_h < stop_h : in_h > stop_h;
+ in_h += strides[1], h++)
+ {
+ for (int32_t in_w = start_w, w = 0; strides[0] > 0 ? in_w < stop_w : in_w > stop_w;
+ in_w += strides[0], w++)
+ {
+ outputData[offset4D(outputShape, b, d, h, w)] =
+ inputData[offset4D(inputShape, in_b, in_d, in_h, in_w)];
+ }
+ }
+ }
+ }
+}
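+
+// Minimal usage sketch of the helper above (hypothetical values; a 1-D slice padded
+// out to the 4-D form expected here):
+//   TensorShape inputShape(6, 1, 1, 1); // W = 6
+//   std::vector<int32_t> starts{1, 0, 0, 0}, stops{5, 1, 1, 1}, strides{2, 1, 1, 1};
+//   // With beginMask = endMask = 0 this copies input elements 1 and 3
+//   // (getOutDim(1, 5, 2) == 2) into outputData.
+//   StridedSlice<float>(inputData, inputShape, 0, 0, starts, stops, strides, outputData);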
+
+void CLStridedSliceCPU::run_on_cpu()
+{
+ // TODO: Support shrinkAxisMask
+ cl::CommandQueue q = CLScheduler::get().queue();
+
+ _input->map(q);
+ _output->map(q);
+ _beginData->map(q);
+ _endData->map(q);
+ _stridesData->map(q);
+
+ TensorShape inputShape = _input->info()->tensor_shape();
+ TensorShape outputShape = _output->info()->tensor_shape();
+
+ std::vector<int32_t> starts;
+ std::vector<int32_t> stops;
+ std::vector<int32_t> strides;
+
+  for (uint32_t idx = 0; idx < _input->info()->num_dimensions(); ++idx)
+ {
+ starts.emplace_back(reinterpret_cast<int32_t *>(_beginData->buffer())[idx]);
+ stops.emplace_back(reinterpret_cast<int32_t *>(_endData->buffer())[idx]);
+ strides.emplace_back(reinterpret_cast<int32_t *>(_stridesData->buffer())[idx]);
+ }
+
+ for (uint32_t i = _input->info()->num_dimensions(); i < maxDims; i++)
+ {
+ starts.emplace_back(0);
+ stops.emplace_back(1);
+ strides.emplace_back(1);
+ }
+
+ switch (_input->info()->data_type())
+ {
+ case DataType::U8:
+ case DataType::QASYMM8:
+ StridedSlice(reinterpret_cast<const uint8_t *>(_input->buffer()), inputShape, _beginMask,
+ _endMask, starts, stops, strides,
+ reinterpret_cast<uint8_t *>(_output->buffer()));
+ break;
+ case DataType::S8:
+ case DataType::QS8:
+ StridedSlice(reinterpret_cast<const int8_t *>(_input->buffer()), inputShape, _beginMask,
+ _endMask, starts, stops, strides, reinterpret_cast<int8_t *>(_output->buffer()));
+ break;
+ case DataType::U16:
+ StridedSlice(reinterpret_cast<const uint16_t *>(_input->buffer()), inputShape, _beginMask,
+ _endMask, starts, stops, strides,
+ reinterpret_cast<uint16_t *>(_output->buffer()));
+ break;
+ case DataType::S16:
+ case DataType::QS16:
+ StridedSlice(reinterpret_cast<const int16_t *>(_input->buffer()), inputShape, _beginMask,
+ _endMask, starts, stops, strides,
+ reinterpret_cast<int16_t *>(_output->buffer()));
+ break;
+ case DataType::F16:
+      // Note: the F16 (half) path here is untested.
+ StridedSlice(reinterpret_cast<const half *>(_input->buffer()), inputShape, _beginMask,
+ _endMask, starts, stops, strides, reinterpret_cast<half *>(_output->buffer()));
+ break;
+ case DataType::U32:
+ StridedSlice(reinterpret_cast<const uint32_t *>(_input->buffer()), inputShape, _beginMask,
+ _endMask, starts, stops, strides,
+ reinterpret_cast<uint32_t *>(_output->buffer()));
+ break;
+ case DataType::S32:
+ StridedSlice(reinterpret_cast<const int32_t *>(_input->buffer()), inputShape, _beginMask,
+ _endMask, starts, stops, strides,
+ reinterpret_cast<int32_t *>(_output->buffer()));
+ break;
+ case DataType::F32:
+ StridedSlice(reinterpret_cast<const float *>(_input->buffer()), inputShape, _beginMask,
+ _endMask, starts, stops, strides, reinterpret_cast<float *>(_output->buffer()));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("DataType not supported");
+ break;
+ }
+
+ _input->unmap(q);
+ _output->unmap(q);
+ _beginData->unmap(q);
+ _endData->unmap(q);
+ _stridesData->unmap(q);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp
new file mode 100644
index 000000000..6426364c9
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp
@@ -0,0 +1,305 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLTopKV2.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+
+#include <algorithm>
+#include <cstdlib>
+#include <stdexcept>
+#include <vector>
+
+#include "../../topk_v2.h"
+
+namespace arm_compute
+{
+
+CLTopKV2::CLTopKV2()
+ : _k(0), _total_bits(0), _bits(0), _radix(0), _hist_buf_size(0), _glob_sum_buf_size(0), _n(0),
+ _input(nullptr), _values(nullptr), _indices(nullptr), _qs_idx_buf(), _qs_temp_buf(),
+ _hist_buf(), _glob_sum_buf(), _temp_buf(), _first_negative_idx_buf(), _in_key_buf(),
+ _out_key_buf(), _in_ind_buf(), _out_ind_buf(), _p_in_key_buf(nullptr),
+ _p_out_key_buf(nullptr), _p_in_ind_buf(nullptr), _p_out_ind_buf(nullptr), _qs_kernel(),
+ _init_kernel(), _hist_kernel(), _scan_hist_kernel(), _glob_scan_hist_kernel(),
+ _paste_hist_kernel(), _reorder_kernel(), _find_first_negative_kernel(),
+ _reorder_negatives_kernel(), _store_kernel()
+{
+}
+
+void CLTopKV2::configure(ICLTensor *input, int k, ICLTensor *values, ICLTensor *indices,
+ int total_bits, int bits)
+{
+ _total_bits = total_bits;
+ _bits = bits;
+ _n = input->info()->tensor_shape()[0];
+
+  // _total_bits must be divisible by _bits.
+ ARM_COMPUTE_ERROR_ON((_total_bits % _bits) != 0);
+
+ _k = k;
+ _radix = 1 << bits;
+
+ _input = input;
+ _values = values;
+ _indices = indices;
+
+ std::string topk_env;
+
+ char *env = getenv("ACL_TOPKV2");
+ if (env)
+ topk_env = env;
+
+ if (topk_env == "GPU_SINGLE")
+ {
+ _qs_idx_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n);
+ _qs_temp_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n);
+
+ _qs_kernel.configure(input, values, indices, &_qs_idx_buf, &_qs_temp_buf, k, _n);
+ }
+ else if (topk_env == "GPU")
+ {
+    // _n must be divisible by (_GROUPS * _ITEMS)
+ ARM_COMPUTE_ERROR_ON((_n % (_GROUPS * _ITEMS)) != 0);
+
+ _hist_buf_size = _radix * _GROUPS * _ITEMS;
+ _glob_sum_buf_size = _HISTOSPLIT;
+
+ _hist_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+ sizeof(cl_int) * _hist_buf_size);
+ _glob_sum_buf =
+ cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+ sizeof(cl_int) * _glob_sum_buf_size);
+ _temp_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+ sizeof(cl_int) * _glob_sum_buf_size);
+ _first_negative_idx_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int));
+ _in_key_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_float) * _n);
+ _out_key_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_float) * _n);
+ _in_ind_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n);
+ _out_ind_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n);
+
+ _p_in_key_buf = &_in_key_buf;
+ _p_out_key_buf = &_out_key_buf;
+ _p_in_ind_buf = &_in_ind_buf;
+ _p_out_ind_buf = &_out_ind_buf;
+
+ _init_kernel.configure(input, _p_in_key_buf, _p_in_ind_buf, _n);
+ _hist_kernel.configure(&_hist_buf, bits, _n);
+ _scan_hist_kernel.configure(&_hist_buf, &_glob_sum_buf, bits);
+ _glob_scan_hist_kernel.configure(&_glob_sum_buf, &_temp_buf, bits);
+ _paste_hist_kernel.configure(&_hist_buf, &_glob_sum_buf, bits);
+ _reorder_kernel.configure(&_hist_buf, bits, _n);
+ _find_first_negative_kernel.configure(&_first_negative_idx_buf, _n);
+ _reorder_negatives_kernel.configure(&_first_negative_idx_buf, _n);
+ _store_kernel.configure(values, indices, k, _n);
+ }
+ else
+ {
+ // DO NOTHING for CPU.
+ }
+}
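+
+// Dispatch is driven by the ACL_TOPKV2 environment variable read in configure() and run():
+//   ACL_TOPKV2=GPU_SINGLE -> single-work-item quicksort on the GPU
+//   ACL_TOPKV2=GPU        -> radix-sort based implementation on the GPU
+//   unset / other value   -> CPU fallback (run_on_cpu)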
+
+void CLTopKV2::run()
+{
+ std::string topk_env;
+
+ char *env = getenv("ACL_TOPKV2");
+ if (env)
+ topk_env = env;
+
+ if (topk_env == "GPU_SINGLE")
+ {
+ run_on_gpu_single_quicksort();
+ }
+ else if (topk_env == "GPU")
+ {
+ run_on_gpu();
+ }
+ else
+ {
+ run_on_cpu();
+ }
+}
+
+void CLTopKV2::run_on_gpu_single_quicksort()
+{
+ // This is a single threaded quick sort implementation.
+ CLScheduler::get().enqueue(_qs_kernel, false);
+
+ arm_compute::CLScheduler::get().sync();
+}
+
+void CLTopKV2::run_on_gpu()
+{
+ cl::CommandQueue q = CLScheduler::get().queue();
+
+  // 1. CLTopKV2Init sets up the key buffer and the index buffer.
+  // - The key buffer is filled with the same values as the layer's input.
+  // - The index buffer is filled with each element's index.
+ CLScheduler::get().enqueue(_init_kernel, false);
+
+ int n_passes = _total_bits / _bits;
+
+  // 2. Repeat (total_bits / bits) times.
+  // - total_bits is the number of bits of the data type (e.g., 32 for float)
+  // - bits defines the number of radix buckets per pass (e.g., 4 bits gives 16 buckets)
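+  // e.g., with total_bits = 32 and bits = 4, the loop below makes 32 / 4 = 8 passes.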
+ for (int pass = 0; pass < n_passes; ++pass)
+ {
+ arm_compute::CLScheduler::get().sync();
+
+ // 2.1. Calculate histogram with _GROUPS * _ITEMS threads
+ _hist_kernel.setPass(pass, _p_in_key_buf);
+ CLScheduler::get().enqueue(_hist_kernel, false);
+
+ // 2.2. Calculate prefix sum locally with multiple threads
+ CLScheduler::get().enqueue(_scan_hist_kernel, false);
+ // 2.3. Calculate prefix sum within a work group
+ CLScheduler::get().enqueue(_glob_scan_hist_kernel, false);
+ // 2.4. Calculate global prefix sum
+ CLScheduler::get().enqueue(_paste_hist_kernel, false);
+
+ // 2.5. Reorder keys and indices based on the global prefix sum
+ _reorder_kernel.setPass(pass, _p_in_key_buf, _p_out_key_buf, _p_in_ind_buf, _p_out_ind_buf);
+ CLScheduler::get().enqueue(_reorder_kernel, false);
+
+ cl::Buffer *tmp;
+ // swap key buffers
+ tmp = _p_in_key_buf;
+ _p_in_key_buf = _p_out_key_buf;
+ _p_out_key_buf = tmp;
+
+ // swap index buffers
+ tmp = _p_in_ind_buf;
+ _p_in_ind_buf = _p_out_ind_buf;
+ _p_out_ind_buf = tmp;
+ }
+
+  // 3. Find the index of the first negative key.
+  // Because we swap the in/out buffer pointers at the end of each pass above,
+  // the sorted results are now in the "in" buffers.
+ _find_first_negative_kernel.setOutputBuffer(_p_in_key_buf);
+ CLScheduler::get().enqueue(_find_first_negative_kernel, false);
+
+  // 4. Correct the ordering of negatives
+  // - Since the radix sort does not treat negative values specially, they end up
+  //   sorted after (i.e., as if larger than) the positive values.
+  // The reordered data will be stored in _p_out_key_buf and _p_out_ind_buf.
+ _reorder_negatives_kernel.setBuffers(_p_in_key_buf, _p_out_key_buf, _p_in_ind_buf,
+ _p_out_ind_buf);
+ CLScheduler::get().enqueue(_reorder_negatives_kernel, false);
+
+ // 5. Extract top k values from sorted keys and indices.
+ _store_kernel.setOutputBuffers(_p_out_key_buf, _p_out_ind_buf);
+ CLScheduler::get().enqueue(_store_kernel, false);
+
+ arm_compute::CLScheduler::get().sync();
+
+#if 0
+  // The code below is left in place for debugging.
+ int first_neg;
+ q.enqueueReadBuffer(_first_negative_idx_buf, CL_TRUE, 0, sizeof(cl_int), &first_neg);
+ std::cout << "first neg = " << first_neg << std::endl;
+
+ float in_key[_n];
+ q.enqueueReadBuffer(*_p_in_key_buf, CL_TRUE, 0, sizeof(cl_float)*_n, in_key);
+ for(uint32_t i = 0 ; i < _n; ++i) {
+ std::cout << "in_key[" << i << "] = " << in_key[i] << std::endl;
+ }
+
+ float out_key[_n];
+ q.enqueueReadBuffer(*_p_out_key_buf, CL_TRUE, 0, sizeof(cl_float)*_n, out_key);
+ for(uint32_t i = 0 ; i < _n; ++i) {
+ std::cout << "out_key[" << i << "] = " << out_key[i] << std::endl;
+ }
+
+ int in_ind[_n];
+ q.enqueueReadBuffer(*_p_in_ind_buf, CL_TRUE, 0, sizeof(cl_int)*_n, in_ind);
+ for(uint32_t i = 0 ; i < _n; ++i) {
+ std::cout << "in_ind[" << i << "] = " << in_ind[i] << std::endl;
+ }
+
+ int out_ind[_n];
+ q.enqueueReadBuffer(*_p_out_ind_buf, CL_TRUE, 0, sizeof(cl_int)*_n, out_ind);
+ for(uint32_t i = 0 ; i < _n; ++i) {
+ std::cout << "out_ind[" << i << "] = " << out_ind[i] << std::endl;
+ }
+
+ int hist_buf[_hist_buf_size];
+ q.enqueueReadBuffer(_hist_buf, CL_TRUE, 0, sizeof(cl_int)*_hist_buf_size, hist_buf);
+ for(uint32_t i = 0 ; i < _hist_buf_size; ++i) {
+ std::cout << "hist_buf[" << i << "] = " << hist_buf[i] << std::endl;
+ }
+
+ int glob_sum_buf[_glob_sum_buf_size];
+ q.enqueueReadBuffer(_glob_sum_buf, CL_TRUE, 0, sizeof(cl_int)*_glob_sum_buf_size, glob_sum_buf);
+ for(uint32_t i = 0 ; i < _glob_sum_buf_size; ++i) {
+ std::cout << "glob_sum_buf[" << i << "] = " << glob_sum_buf[i] << std::endl;
+ }
+
+#endif
+}
+
+void CLTopKV2::run_on_cpu()
+{
+ cl::CommandQueue q = CLScheduler::get().queue();
+ // const Window& w = _topkv2_kernel.window();
+
+ _input->map(q);
+ _values->map(q);
+ _indices->map(q);
+
+ // int row_size = (w[0].end() - w[0].start()) / w[0].step();
+ int row_size = _input->info()->tensor_shape()[0];
+ int rank = _input->info()->num_dimensions();
+
+  if (rank > 2)
+    throw std::runtime_error("Input tensor rank > 2 is not supported.");
+
+ int row_num = (rank == 2 ? _input->info()->tensor_shape()[1] : 1);
+
+ if (_input->info()->data_type() == DataType::F32)
+ {
+ nnfw::rt::optimized_ops::TopK<float>(row_size, row_num, (float *)_input->buffer(), _k,
+ (int32 *)_indices->buffer(), (float *)_values->buffer());
+ }
+ else if (_input->info()->data_type() == DataType::S32)
+ {
+ nnfw::rt::optimized_ops::TopK<int32_t>(row_size, row_num, (int32_t *)_input->buffer(), _k,
+ (int32 *)_indices->buffer(),
+ (int32_t *)_values->buffer());
+ }
+ else if (_input->info()->data_type() == DataType::QASYMM8)
+ {
+ nnfw::rt::optimized_ops::TopK<uint8_t>(row_size, row_num, (uint8_t *)_input->buffer(), _k,
+ (int32 *)_indices->buffer(),
+ (uint8_t *)_values->buffer());
+ }
+ else
+ {
+ throw std::runtime_error("Not supported type.");
+ }
+
+ _input->unmap(q);
+ _values->unmap(q);
+ _indices->unmap(q);
+}
+} // namespace arm_compute
diff --git a/libs/ARMComputeEx/src/runtime/topk_v2.h b/libs/ARMComputeEx/src/runtime/topk_v2.h
new file mode 100644
index 000000000..a18ff0b0d
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/topk_v2.h
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__
+#define __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__
+
+#include <algorithm>
+#include <cstdint>
+#include <vector>
+
+typedef int32_t int32;
+
+namespace nnfw
+{
+namespace rt
+{
+namespace optimized_ops
+{
+// The following code was implemented and modified with reference to the TFLite topk_v2.cc file.
+// TopK_v2 of NN Runtime supports TENSOR_FLOAT32, TENSOR_QUANT8_ASYMM and TENSOR_INT32,
+// which differs from TFLite.
+// (TFLite additionally supports kTfLiteInt64.)
+
+// The class that collects top indexes of k values. Based on template
+// tensorflow::gtl::TopN<> but, for optimization,
+// it re-uses the same container.
+template <typename T> class TopContainer
+{
+public:
+ TopContainer() = delete;
+ TopContainer(int32 k, int32 row_size) : k_(k), container_(), values_(nullptr)
+ {
+ container_.reserve(std::min(k, row_size) + 1);
+ }
+
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ TopContainer(const TopContainer &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ TopContainer &operator=(const TopContainer &) = delete;
+
+ void start_collecting(const T *values)
+ {
+ values_ = values;
+ container_.clear();
+ }
+
+ void push(int32 a)
+ {
+ auto comparator = [this](int32 a, int32 b) { return compare_fun(a, b); };
+ if (container_.size() <= (size_t)k_)
+ {
+ container_.push_back(a);
+ if (container_.size() == (size_t)(k_ + 1))
+ {
+ std::make_heap(container_.begin(), container_.end(), comparator);
+ std::pop_heap(container_.begin(), container_.end(), comparator);
+ }
+ }
+ else if (comparator(a, container_.front()))
+ {
+ container_.back() = a;
+ std::push_heap(container_.begin(), container_.end(), comparator);
+ std::pop_heap(container_.begin(), container_.end(), comparator);
+ }
+ }
+
+ const std::vector<int32> &sorted_result()
+ {
+ auto comparator = [this](int32 a, int32 b) { return compare_fun(a, b); };
+ if (container_.size() <= (size_t)(k_))
+ {
+ std::sort(container_.begin(), container_.end(), comparator);
+ }
+ else
+ {
+ std::sort_heap(container_.begin(), container_.end() - 1, comparator);
+ container_.resize(k_);
+ }
+ return container_;
+ }
+
+private:
+ int32 k_;
+ std::vector<int32> container_;
+ const T *values_ = nullptr;
+
+ bool compare_fun(int32 a, int32 b) const
+ {
+ if (values_[b] < values_[a])
+ {
+ return true;
+ }
+ else if (values_[b] > values_[a])
+ {
+ return false;
+ }
+ else
+ {
+ return a < b;
+ }
+ }
+};
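+
+// Illustrative example (hypothetical values): with values = {1.0, 5.0, 3.0, 2.0} and k = 2,
+// pushing the indices 0..3 and then calling sorted_result() yields {1, 2}, i.e. the indices
+// of the two largest values in descending order (ties are broken by the smaller index).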
+
+template <typename T>
+void TopK(int32 row_size, int32 num_rows, const T *data, int32 k, int32 *output_indexes,
+ T *output_values)
+{
+ TopContainer<T> topc(k, row_size);
+ for (int row = 0; row < num_rows; ++row)
+ {
+ const T *values_row = data + row * row_size;
+ topc.start_collecting(values_row);
+ for (int32 c = 0; c < row_size; ++c)
+ {
+ topc.push(c);
+ }
+
+ // Prepare output buffers.
+ int32 *indexes_row = output_indexes + row * k;
+ T *output_row = output_values + row * k;
+ // We always assume that the output is sorted.
+ const auto &top_k = topc.sorted_result();
+ std::copy(top_k.begin(), top_k.end(), indexes_row);
+ std::transform(top_k.begin(), top_k.end(), output_row,
+ [values_row](const int32 loc) { return values_row[loc]; });
+ }
+}
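+
+// Minimal usage sketch (hypothetical data):
+//   float data[4] = {1.f, 5.f, 3.f, 2.f};
+//   int32 idx[2];
+//   float vals[2];
+//   TopK<float>(/*row_size=*/4, /*num_rows=*/1, data, /*k=*/2, idx, vals);
+//   // -> idx = {1, 2}, vals = {5.f, 3.f}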
+
+} // namespace optimized_ops
+} // namespace rt
+} // namespace nnfw
+
+#endif // __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__