diff options
-rw-r--r-- | packaging/libarmcl.manifest | 5 | ||||
-rw-r--r-- | packaging/libarmcl.spec | 159 | ||||
-rw-r--r-- | packaging/patch.patch | 7025 |
3 files changed, 7189 insertions, 0 deletions
diff --git a/packaging/libarmcl.manifest b/packaging/libarmcl.manifest new file mode 100644 index 000000000..017d22d3a --- /dev/null +++ b/packaging/libarmcl.manifest @@ -0,0 +1,5 @@ +<manifest> + <request> + <domain name="_"/> + </request> +</manifest> diff --git a/packaging/libarmcl.spec b/packaging/libarmcl.spec new file mode 100644 index 000000000..092c4e238 --- /dev/null +++ b/packaging/libarmcl.spec @@ -0,0 +1,159 @@ +Name: libarmcl +Version: v19.05 +Release: 0 +License: MIT +Url: https://github.com/ARM-software/ComputeLibrary +Summary: The ARM Computer Vision and Machine Learning library +Group: Graphics & UI Framework/Libraries +Source0: %{name}-%{version}.tar.bz2 +Source1001: %name.manifest +ExclusiveArch: %{arm} aarch64 + +BuildRequires: python3-base +BuildRequires: python +BuildRequires: scons + +%define OPEN_CL_SUPPORT 1 +%define NEON_SUPPORT 1 +%define BENCHMARK_TEST 1 + +%description +The ARM Computer Vision and Machine Learning library is a set of functions optimised for both ARM CPUs and GPUs using SIMD technologies + +%package -n %{name}-release +Summary: ARM Compute Library file + +%description -n %{name}-release +Summary: ARM Compute Library file + +%package -n %{name}-devel +Summary: Userspace interface to ARM Compute Library + +%description -n %{name}-devel +Summary: Userspace interface to ARM Compute Library + +%package -n %{name}-tools +Summary: Sample application and benchmark binaries to test ARM Compute Library + +%description -n %{name}-tools +Summary: Sample application and benchmark binaries to test ARM Compute Library + +%prep +%setup -q +cp %{SOURCE1001} . 
+ +%build +echo %{_builddir} +scons -j8 \ + Werror=0 \ + debug=0 \ +%if 0%{?NEON_SUPPORT} == 1 + neon=1 \ +%endif +%if 0%{?OPEN_CL_SUPPORT} == 1 + opencl=1 \ +%endif + os=linux \ +%ifarch aarch64 + arch=arm64-v8.2-a \ +%else + arch=armv7a \ +%endif + embed_kernels=1 \ +%if 0%{?BENCHMARK_TEST} == 1 + benchmark_tests=1 +%endif + +%post -p /sbin/ldconfig + +%postun -p /sbin/ldconfig + +%install +mkdir -p %{buildroot}%{_libdir} +mkdir -p %{buildroot}%{_libdir}/data +mkdir -p %{buildroot}%{_bindir} +mkdir -p %{buildroot}/usr/include/arm_compute +mkdir -p %{buildroot}/usr/include/support +mkdir -p %{buildroot}/usr/include/CL +mkdir -p %{buildroot}/usr/include/half +mkdir -p %{buildroot}/usr/include/libnpy +# Shared objects: mode 755 so the installed libraries keep the executable bit +install -m 755 build/libarm_compute_core.so %{buildroot}%{_libdir} +install -m 755 build/libarm_compute.so %{buildroot}%{_libdir} +install -m 755 build/libarm_compute_graph.so %{buildroot}%{_libdir} +install -m 755 build/opencl-1.2-stubs/libOpenCL.so %{buildroot}%{_libdir} +# Example programs: mode 755, otherwise they are packaged in bindir without the executable bit +install -m 755 build/examples/cl_convolution %{buildroot}%{_bindir} +install -m 755 build/examples/cl_events %{buildroot}%{_bindir} +install -m 755 build/examples/cl_sgemm %{buildroot}%{_bindir} +install -m 755 build/examples/graph_alexnet %{buildroot}%{_bindir} +install -m 755 build/examples/graph_googlenet %{buildroot}%{_bindir} +install -m 755 build/examples/graph_inception_v3 %{buildroot}%{_bindir} +install -m 755 build/examples/graph_inception_v4 %{buildroot}%{_bindir} +install -m 755 build/examples/graph_lenet %{buildroot}%{_bindir} +install -m 755 build/examples/graph_mobilenet %{buildroot}%{_bindir} +install -m 755 build/examples/graph_resnet50 %{buildroot}%{_bindir} +install -m 755 build/examples/graph_resnext50 %{buildroot}%{_bindir} +install -m 755 build/examples/graph_squeezenet %{buildroot}%{_bindir} +install -m 755 build/examples/graph_squeezenet_v1_1 %{buildroot}%{_bindir} +install -m 755 build/examples/graph_vgg16 %{buildroot}%{_bindir} +install -m 755 build/examples/graph_vgg19 
%{buildroot}%{_bindir} +install -m 755 build/examples/neon_cartoon_effect %{buildroot}%{_bindir} +install -m 755 build/examples/neoncl_scale_median_gaussian %{buildroot}%{_bindir} +install -m 755 build/examples/neon_cnn %{buildroot}%{_bindir} +install -m 755 build/examples/neon_convolution %{buildroot}%{_bindir} +install -m 755 build/examples/neon_copy_objects %{buildroot}%{_bindir} +install -m 755 build/examples/neon_scale %{buildroot}%{_bindir} + +cp -r %{_builddir}/%{name}-%{version}/arm_compute/* %{buildroot}/usr/include/arm_compute/ +cp -r %{_builddir}/%{name}-%{version}/support/* %{buildroot}/usr/include/support/ +cp -r %{_builddir}/%{name}-%{version}/include/CL/* %{buildroot}/usr/include/CL/ +cp -r %{_builddir}/%{name}-%{version}/include/half/* %{buildroot}/usr/include/half/ +cp -r %{_builddir}/%{name}-%{version}/include/libnpy/* %{buildroot}/usr/include/libnpy/ + +%if 0%{?BENCHMARK_TEST} == 1 +install -m 755 %{_builddir}/%{name}-%{version}/build/tests/arm_compute_benchmark %{buildroot}%{_bindir} +cp -r %{_builddir}/%{name}-%{version}/data/* %{buildroot}%{_libdir}/data/ +%endif + +%files -n %{name}-release +%manifest %{name}.manifest +%{_libdir}/libarm_compute*.so + +%files -n %{name}-devel +%manifest %{name}.manifest +%{_libdir}/libarm_compute*.so +%{_libdir}/libOpenCL.so +%{_includedir}/arm_compute/* +%{_includedir}/support/* +%{_includedir}/CL/* +%{_includedir}/half/* +%{_includedir}/libnpy/* + +%files -n %{name}-tools +%manifest %{name}.manifest +%{_bindir}/cl_convolution +%{_bindir}/cl_events +%{_bindir}/cl_sgemm +%{_bindir}/graph_alexnet +%{_bindir}/graph_googlenet +%{_bindir}/graph_inception_v3 +%{_bindir}/graph_inception_v4 +%{_bindir}/graph_lenet +%{_bindir}/graph_mobilenet +%{_bindir}/graph_resnet50 +%{_bindir}/graph_resnext50 +%{_bindir}/graph_squeezenet +%{_bindir}/graph_squeezenet_v1_1 +%{_bindir}/graph_vgg16 +%{_bindir}/graph_vgg19 +%{_bindir}/neon_cartoon_effect +%{_bindir}/neoncl_scale_median_gaussian +%{_bindir}/neon_cnn 
+%{_bindir}/neon_convolution +%{_bindir}/neon_copy_objects +%{_bindir}/neon_scale +%if 0%{?BENCHMARK_TEST} == 1 +%{_bindir}/arm_compute_benchmark +%{_libdir}/data/* +%endif diff --git a/packaging/patch.patch b/packaging/patch.patch new file mode 100644 index 000000000..86a046cd4 --- /dev/null +++ b/packaging/patch.patch @@ -0,0 +1,7025 @@ +From eb0682abf46a5d1ee1c4bfc780815f948c912aca Mon Sep 17 00:00:00 2001 +From: Chunseok Lee <chunseok.lee@samsung.com> +Date: Thu, 23 Aug 2018 17:42:09 +0900 +Subject: [PATCH] Patch for NNFW M2 Release + +1. Add new operations +2. Fix some issue on existing ops + +Change-Id: I8da858291993ba474c8d285d8c63e75f5cf37083 +Signed-off-by: Chunseok Lee <chunseok.lee@samsung.com> +--- + .../core/CL/kernels/CLArithmeticAdditionKernel.h | 12 +- + .../CL/kernels/CLArithmeticSubtractionKernel.h | 2 + + arm_compute/core/CL/kernels/CLCastKernel.h | 65 +++ + arm_compute/core/CL/kernels/CLGatherKernel.h | 77 ++++ + .../core/CL/kernels/CLPixelWiseDivisionKernel.h | 88 ++++ + .../CL/kernels/CLPixelWiseMultiplicationKernel.h | 8 +- + arm_compute/core/CL/kernels/CLReduceMaxKernel.h | 78 ++++ + .../core/CL/kernels/CLReductionMeanKernel.h | 83 ++++ + arm_compute/core/CL/kernels/CLStridedSliceKernel.h | 106 +++++ + arm_compute/core/CL/kernels/CLTopKV2Kernel.h | 309 +++++++++++++ + arm_compute/core/Helpers.inl | 33 ++ + arm_compute/runtime/CL/CLFunctions.h | 8 + + .../runtime/CL/functions/CLArithmeticAddition.h | 12 +- + .../runtime/CL/functions/CLArithmeticSubtraction.h | 13 +- + arm_compute/runtime/CL/functions/CLCast.h | 52 +++ + arm_compute/runtime/CL/functions/CLGather.h | 56 +++ + .../runtime/CL/functions/CLPixelWiseDivision.h | 71 +++ + .../CL/functions/CLPixelWiseMultiplication.h | 8 +- + arm_compute/runtime/CL/functions/CLReduceMax.h | 89 ++++ + arm_compute/runtime/CL/functions/CLReductionMean.h | 76 ++++ + arm_compute/runtime/CL/functions/CLStridedSlice.h | 73 ++++ + arm_compute/runtime/CL/functions/CLTopKV2.h | 115 +++++ + 
src/core/CL/CLKernelLibrary.cpp | 72 ++++ + src/core/CL/cl_kernels/activation_layer_qa8.cl | 107 ++++- + src/core/CL/cl_kernels/arithmetic_op_quantized.cl | 138 ++++++ + src/core/CL/cl_kernels/cast.cl | 148 +++++++ + src/core/CL/cl_kernels/fixed_point.h | 24 ++ + src/core/CL/cl_kernels/gather.cl | 106 +++++ + src/core/CL/cl_kernels/pixelwise_div_float.cl | 96 +++++ + src/core/CL/cl_kernels/pixelwise_div_int.cl | 103 +++++ + src/core/CL/cl_kernels/pixelwise_mul_quantized.cl | 119 +++++ + src/core/CL/cl_kernels/reduce_max.cl | 60 +++ + src/core/CL/cl_kernels/reduction_mean.cl | 69 +++ + src/core/CL/cl_kernels/strided_slice.cl | 104 +++++ + src/core/CL/cl_kernels/topkv2.cl | 111 +++++ + src/core/CL/cl_kernels/topkv2_quicksort.cl | 138 ++++++ + src/core/CL/cl_kernels/topkv2_radixsort.cl | 279 ++++++++++++ + src/core/CL/kernels/CLActivationLayerKernel.cpp | 53 ++- + src/core/CL/kernels/CLArithmeticAdditionKernel.cpp | 46 +- + .../CL/kernels/CLArithmeticSubtractionKernel.cpp | 125 ++++-- + src/core/CL/kernels/CLCastKernel.cpp | 115 +++++ + src/core/CL/kernels/CLGatherKernel.cpp | 147 +++++++ + src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp | 284 ++++++++++++ + .../CL/kernels/CLPixelWiseMultiplicationKernel.cpp | 37 +- + src/core/CL/kernels/CLReduceMaxKernel.cpp | 135 ++++++ + src/core/CL/kernels/CLReductionMeanKernel.cpp | 190 ++++++++ + src/core/CL/kernels/CLStridedSliceKernel.cpp | 316 ++++++++++++++ + src/core/CL/kernels/CLTopKV2Kernel.cpp | 479 +++++++++++++++++++++ + src/core/Validate.cpp | 2 +- + .../CL/functions/CLArithmeticSubtraction.cpp | 14 +- + src/runtime/CL/functions/CLCast.cpp | 37 ++ + src/runtime/CL/functions/CLGather.cpp | 45 ++ + src/runtime/CL/functions/CLPixelWiseDivision.cpp | 57 +++ + src/runtime/CL/functions/CLReduceMax.cpp | 132 ++++++ + src/runtime/CL/functions/CLReductionMean.cpp | 60 +++ + src/runtime/CL/functions/CLStridedSlice.cpp | 288 +++++++++++++ + src/runtime/CL/functions/CLTopKV2.cpp | 310 +++++++++++++ + src/runtime/topk_v2.h | 141 
++++++ + 58 files changed, 6038 insertions(+), 83 deletions(-) + create mode 100644 arm_compute/core/CL/kernels/CLCastKernel.h + create mode 100644 arm_compute/core/CL/kernels/CLGatherKernel.h + create mode 100644 arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h + create mode 100644 arm_compute/core/CL/kernels/CLReduceMaxKernel.h + create mode 100644 arm_compute/core/CL/kernels/CLReductionMeanKernel.h + create mode 100644 arm_compute/core/CL/kernels/CLStridedSliceKernel.h + create mode 100644 arm_compute/core/CL/kernels/CLTopKV2Kernel.h + create mode 100644 arm_compute/runtime/CL/functions/CLCast.h + create mode 100644 arm_compute/runtime/CL/functions/CLGather.h + create mode 100644 arm_compute/runtime/CL/functions/CLPixelWiseDivision.h + create mode 100644 arm_compute/runtime/CL/functions/CLReduceMax.h + create mode 100644 arm_compute/runtime/CL/functions/CLReductionMean.h + create mode 100644 arm_compute/runtime/CL/functions/CLStridedSlice.h + create mode 100644 arm_compute/runtime/CL/functions/CLTopKV2.h + create mode 100644 src/core/CL/cl_kernels/arithmetic_op_quantized.cl + create mode 100644 src/core/CL/cl_kernels/cast.cl + create mode 100644 src/core/CL/cl_kernels/gather.cl + create mode 100644 src/core/CL/cl_kernels/pixelwise_div_float.cl + create mode 100644 src/core/CL/cl_kernels/pixelwise_div_int.cl + create mode 100644 src/core/CL/cl_kernels/pixelwise_mul_quantized.cl + create mode 100644 src/core/CL/cl_kernels/reduce_max.cl + create mode 100644 src/core/CL/cl_kernels/reduction_mean.cl + create mode 100644 src/core/CL/cl_kernels/strided_slice.cl + create mode 100644 src/core/CL/cl_kernels/topkv2.cl + create mode 100644 src/core/CL/cl_kernels/topkv2_quicksort.cl + create mode 100644 src/core/CL/cl_kernels/topkv2_radixsort.cl + create mode 100644 src/core/CL/kernels/CLCastKernel.cpp + create mode 100644 src/core/CL/kernels/CLGatherKernel.cpp + create mode 100644 src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp + create mode 100644 
src/core/CL/kernels/CLReduceMaxKernel.cpp + create mode 100644 src/core/CL/kernels/CLReductionMeanKernel.cpp + create mode 100644 src/core/CL/kernels/CLStridedSliceKernel.cpp + create mode 100644 src/core/CL/kernels/CLTopKV2Kernel.cpp + create mode 100644 src/runtime/CL/functions/CLCast.cpp + create mode 100644 src/runtime/CL/functions/CLGather.cpp + create mode 100644 src/runtime/CL/functions/CLPixelWiseDivision.cpp + create mode 100644 src/runtime/CL/functions/CLReduceMax.cpp + create mode 100644 src/runtime/CL/functions/CLReductionMean.cpp + create mode 100644 src/runtime/CL/functions/CLStridedSlice.cpp + create mode 100644 src/runtime/CL/functions/CLTopKV2.cpp + create mode 100644 src/runtime/topk_v2.h + +diff --git a/arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h b/arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h +index 5112476..017650f 100644 +--- a/arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h ++++ b/arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h +@@ -53,17 +53,17 @@ public: + ~CLArithmeticAdditionKernel() = default; + /** Initialise the kernel's inputs, output and convertion policy. + * +- * @param[in] input1 First tensor input. Data types supported: U8/QS8/QS16/S16/F16/F32. +- * @param[in] input2 Second tensor input. Data types supported: U8/QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16/F32. +- * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16/F32. ++ * @param[in] input1 First tensor input. Data types supported: U8/QS8/QASYMM8/QS16/S16/F16/F32. ++ * @param[in] input2 Second tensor input. Data types supported: U8/QS8 (only if @p input1 is QS8), QASYMM8(only if @p input1 is QASYMM8), QS16 (only if @p input1 is QS16), S16/F16/F32. ++ * @param[out] output Output tensor. 
Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QASYMM8 (only if both inputs are QASYMM8), QS16 (only if both inputs are QS16), S16/F16/F32. + * @param[in] policy Policy to use to handle overflow. + */ + void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy); + /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticAdditionKernel + * +- * @param[in] input1 First tensor input info. Data types supported: U8/QS8/QS16/S16/F16/F32. +- * @param[in] input2 Second tensor input info. Data types supported: U8/QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16/F32. +- * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16/F32. ++ * @param[in] input1 First tensor input info. Data types supported: U8/QS8/QASYMM8/QS16/S16/F16/F32. ++ * @param[in] input2 Second tensor input info. Data types supported: U8/QS8 (only if @p input1 is QS8), QASYMM8(only if @p input1 is QASYMM8), QS16 (only if @p input1 is QS16), S16/F16/F32. ++ * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QASYMM8 (only if both inputs are QASYMM8), QS16 (only if both inputs are QS16), S16/F16/F32. + * @param[in] policy Policy to use to handle overflow. + * + * @return a status +diff --git a/arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h b/arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h +index c5f862a..5e374a5 100644 +--- a/arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h ++++ b/arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h +@@ -1,4 +1,5 @@ + /* ++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT +@@ -74,6 +75,7 @@ public: + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; ++ BorderSize border_size() const override; + + private: + const ICLTensor *_input1; /**< Source tensor 1 */ +diff --git a/arm_compute/core/CL/kernels/CLCastKernel.h b/arm_compute/core/CL/kernels/CLCastKernel.h +new file mode 100644 +index 0000000..19e482f +--- /dev/null ++++ b/arm_compute/core/CL/kernels/CLCastKernel.h +@@ -0,0 +1,65 @@ ++/* ++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved ++ * Copyright (c) 2016-2018 ARM Limited. ++ * ++ * SPDX-License-Identifier: MIT ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in all ++ * copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. 
++ */ ++#ifndef __ARM_COMPUTE_CLCASTKERNEL_H__ ++#define __ARM_COMPUTE_CLCASTKERNEL_H__ ++ ++#include "arm_compute/core/CL/ICLKernel.h" ++ ++namespace arm_compute ++{ ++class ICLTensor; ++ ++/** OpenCL kernel to perform a cast operation */ ++class CLCastKernel : public ICLKernel ++{ ++public: ++ /** Default constructor */ ++ CLCastKernel(); ++ /** Prevent instances of this class from being copied (As this class contains pointers) */ ++ CLCastKernel(const CLCastKernel &) = delete; ++ /** Prevent instances of this class from being copied (As this class contains pointers) */ ++ CLCastKernel &operator=(const CLCastKernel &) = delete; ++ /** Allow instances of this class to be moved */ ++ CLCastKernel(CLCastKernel &&) = default; ++ /** Allow instances of this class to be moved */ ++ CLCastKernel &operator=(CLCastKernel &&) = default; ++ /** Default destructor */ ++ ~CLCastKernel() = default; ++ /** Initialise the kernel's input and output. ++ * ++ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. ++ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. ++ */ ++ void configure(const ICLTensor *input, ICLTensor *output); ++ ++ // Inherited methods overridden: ++ void run(const Window &window, cl::CommandQueue &queue) override; ++ ++private: ++ const ICLTensor *_input; /**< Source tensor */ ++ ICLTensor *_output; /**< Destination tensor */ ++}; ++} // namespace arm_compute ++#endif /* __ARM_COMPUTE_CLCASTKERNEL_H__ */ +diff --git a/arm_compute/core/CL/kernels/CLGatherKernel.h b/arm_compute/core/CL/kernels/CLGatherKernel.h +new file mode 100644 +index 0000000..530491a +--- /dev/null ++++ b/arm_compute/core/CL/kernels/CLGatherKernel.h +@@ -0,0 +1,77 @@ ++/* ++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved ++ * Copyright (c) 2016-2018 ARM Limited. 
++ * ++ * SPDX-License-Identifier: MIT ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in all ++ * copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++#ifndef __ARM_COMPUTE_CLGATHERKERNEL_H__ ++#define __ARM_COMPUTE_CLGATHERKERNEL_H__ ++ ++#include "arm_compute/core/CL/ICLKernel.h" ++#include "arm_compute/core/Types.h" ++ ++namespace arm_compute ++{ ++class ICLTensor; ++ ++/** Interface for the gather kernel. ++ * ++ */ ++class CLGatherKernel : public ICLKernel ++{ ++public: ++ /** Default constructor.*/ ++ CLGatherKernel(); ++ /** Prevent instances of this class from being copied (As this class contains pointers). */ ++ CLGatherKernel(const CLGatherKernel &) = delete; ++ /** Prevent instances of this class from being copied (As this class contains pointers). 
*/ ++ CLGatherKernel &operator=(const CLGatherKernel &) = delete; ++ /** Allow instances of this class to be moved */ ++ CLGatherKernel(CLGatherKernel &&) = default; ++ /** Allow instances of this class to be moved */ ++ CLGatherKernel &operator=(CLGatherKernel &&) = default; ++ /** Initialise the kernel's inputs and output. ++ * ++ * @param[in] input1 An input tensor. Data types supported: U8/S32/F32. ++ * @param[in] input2 An input tensor. Data types supported: S32. ++ * @param[out] output The output tensor, Data types supported: same as @p input1. ++ */ ++ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output); ++ /** Static function to check if given info will lead to a valid configuration of @ref CLGatherKernel ++ * ++ * @param[in] input1 An input tensor. Data types supported: U8/S32/F32. ++ * @param[in] input2 An input tensor. Data types supported: S32. ++ * @param[in] output The output tensor info, Data types supported: same as @p input1. ++ * ++ * @return a status ++ */ ++ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output); ++ ++ // Inherited methods overridden: ++ void run(const Window &window, cl::CommandQueue &queue) override; ++ ++private: ++ const ICLTensor *_input1; ++ const ICLTensor *_input2; ++ ICLTensor *_output; ++}; ++} // namespace arm_compute ++#endif /*__ARM_COMPUTE_CLGATHERKERNEL_H__ */ +diff --git a/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h b/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h +new file mode 100644 +index 0000000..2e542b3 +--- /dev/null ++++ b/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h +@@ -0,0 +1,88 @@ ++/* ++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved ++ * Copyright (c) 2016-2018 ARM Limited. 
++ * ++ * SPDX-License-Identifier: MIT ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in all ++ * copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++#ifndef __ARM_COMPUTE_CLPIXELWISEDIVISIONKERNEL_H__ ++#define __ARM_COMPUTE_CLPIXELWISEDIVISIONKERNEL_H__ ++ ++#include "arm_compute/core/CL/ICLKernel.h" ++#include "arm_compute/core/Types.h" ++ ++namespace arm_compute ++{ ++class ICLTensor; ++ ++/** Interface for the pixelwise division kernel. ++ * ++ */ ++class CLPixelWiseDivisionKernel : public ICLKernel ++{ ++public: ++ /** Default constructor.*/ ++ CLPixelWiseDivisionKernel(); ++ /** Prevent instances of this class from being copied (As this class contains pointers). */ ++ CLPixelWiseDivisionKernel(const CLPixelWiseDivisionKernel &) = delete; ++ /** Prevent instances of this class from being copied (As this class contains pointers). 
*/ ++ CLPixelWiseDivisionKernel &operator=(const CLPixelWiseDivisionKernel &) = delete; ++ /** Allow instances of this class to be moved */ ++ CLPixelWiseDivisionKernel(CLPixelWiseDivisionKernel &&) = default; ++ /** Allow instances of this class to be moved */ ++ CLPixelWiseDivisionKernel &operator=(CLPixelWiseDivisionKernel &&) = default; ++ /** Initialise the kernel's inputs, output, scale and overflow/rounding policies. ++ * ++ * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32. ++ * @param[in] input2 An input tensor. Data types supported: same as @p input1. ++ * @param[out] output The output tensor, Data types supported: same as @p input1. Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). ++ * @param[in] scale Scale to apply after division. ++ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1. ++ * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate ++ * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. ++ */ ++ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale, ++ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); ++ /** Static function to check if given info will lead to a valid configuration of @ref CLPixelWiseDivisionKernel ++ * ++ * @param[in] input1 An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32. ++ * @param[in] input2 An input tensor info. Data types supported: same as @p input1. ++ * @param[in] output The output tensor info, Data types supported: same as @p input1. Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). ++ * @param[in] scale Scale to apply after division. ++ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1. ++ * @param[in] overflow_policy Overflow policy. 
Supported overflow policies: Wrap, Saturate ++ * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. ++ * ++ * @return a status ++ */ ++ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ++ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); ++ ++ // Inherited methods overridden: ++ void run(const Window &window, cl::CommandQueue &queue) override; ++ BorderSize border_size() const override; ++ ++private: ++ const ICLTensor *_input1; ++ const ICLTensor *_input2; ++ ICLTensor *_output; ++}; ++} // namespace arm_compute ++#endif /*__ARM_COMPUTE_CLPIXELWISEDIVISIONKERNEL_H__ */ +diff --git a/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h b/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h +index fcabb61..66c0b36 100644 +--- a/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h ++++ b/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h +@@ -49,9 +49,9 @@ public: + CLPixelWiseMultiplicationKernel &operator=(CLPixelWiseMultiplicationKernel &&) = default; + /** Initialise the kernel's input, output and border mode. + * +- * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32. ++ * @param[in] input1 An input tensor. Data types supported: U8/QS8/QASYMM8/QS16/S16/F16/F32. + * @param[in] input2 An input tensor. Data types supported: same as @p input1. +- * @param[out] output The output tensor, Data types supported: same as @p input1. Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). ++ * @param[out] output The output tensor, Data types supported: same as @p input1. Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). QASYMM8 requires both inputs are QASYMM8. + * @param[in] scale Scale to apply after multiplication. + * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1. 
+ * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate +@@ -61,9 +61,9 @@ public: + ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); + /** Static function to check if given info will lead to a valid configuration of @ref CLPixelWiseMultiplicationKernel + * +- * @param[in] input1 An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32. ++ * @param[in] input1 An input tensor info. Data types supported: U8/QS8/QASYMM8/QS16/S16/F16/F32. + * @param[in] input2 An input tensor info. Data types supported: same as @p input1. +- * @param[in] output The output tensor info, Data types supported: same as @p input1. Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). ++ * @param[in] output The output tensor info, Data types supported: same as @p input1. Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). QASYMM8 requires both inputs are QASYMM8. + * @param[in] scale Scale to apply after multiplication. + * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1. + * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate +diff --git a/arm_compute/core/CL/kernels/CLReduceMaxKernel.h b/arm_compute/core/CL/kernels/CLReduceMaxKernel.h +new file mode 100644 +index 0000000..184389a +--- /dev/null ++++ b/arm_compute/core/CL/kernels/CLReduceMaxKernel.h +@@ -0,0 +1,78 @@ ++/* ++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved ++ * Copyright (c) 2016-2018 ARM Limited. 
++ * ++ * SPDX-License-Identifier: MIT ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in all ++ * copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++#ifndef __ARM_COMPUTE_CLREDUCEMAXKERNEL_H__ ++#define __ARM_COMPUTE_CLREDUCEMAXKERNEL_H__ ++ ++#include "arm_compute/core/CL/ICLKernel.h" ++#include "arm_compute/core/Types.h" ++ ++namespace arm_compute ++{ ++class ICLTensor; ++ ++/** Interface for the pixelwise division kernel. ++ * ++ */ ++class CLReduceMaxKernel : public ICLKernel ++{ ++public: ++ /** Default constructor.*/ ++ CLReduceMaxKernel(); ++ /** Prevent instances of this class from being copied (As this class contains pointers). */ ++ CLReduceMaxKernel(const CLReduceMaxKernel &) = delete; ++ /** Prevent instances of this class from being copied (As this class contains pointers). 
*/ ++ CLReduceMaxKernel &operator=(const CLReduceMaxKernel &) = delete; ++ /** Allow instances of this class to be moved */ ++ CLReduceMaxKernel(CLReduceMaxKernel &&) = default; ++ /** Allow instances of this class to be moved */ ++ CLReduceMaxKernel &operator=(CLReduceMaxKernel &&) = default; ++ /** Initialise the kernel's input, reduction axis and output. ++ * ++ * @param[in] input An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32. ++ * @param[in] axis Axis to reduce ++ * @param[out] output The output tensor, Data types supported: same as @p input. Note: U8 (QS8, QS16) requires the input to be U8 (QS8, QS16). ++ */ ++ void configure(const ICLTensor *input, int32_t axis, ICLTensor *output); ++ /** Static function to check if given info will lead to a valid configuration of @ref CLReduceMaxKernel ++ * ++ * @param[in] input An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32. ++ * @param[in] axis Axis to reduce ++ * @param[in] output The output tensor info, Data types supported: same as @p input. Note: U8 (QS8, QS16) requires the input to be U8 (QS8, QS16). ++ * ++ * @return a status ++ */ ++ static Status validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output); ++ ++ // Inherited methods overridden: ++ void run(const Window &window, cl::CommandQueue &queue) override; ++ void run_on_cpu(cl::CommandQueue &queue); ++ ++private: ++ const ICLTensor *_input; ++ ICLTensor *_output; ++ int32_t _axis; ++}; ++} // namespace arm_compute ++#endif /*__ARM_COMPUTE_CLREDUCEMAXKERNEL_H__ */ +diff --git a/arm_compute/core/CL/kernels/CLReductionMeanKernel.h b/arm_compute/core/CL/kernels/CLReductionMeanKernel.h +new file mode 100644 +index 0000000..687fdb5 +--- /dev/null ++++ b/arm_compute/core/CL/kernels/CLReductionMeanKernel.h +@@ -0,0 +1,83 @@ ++/* ++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved ++ * Copyright (c) 2017-2018 ARM Limited.
++ * ++ * SPDX-License-Identifier: MIT ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in all ++ * copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. 
++ */ ++#ifndef __ARM_COMPUTE_CLREDUCTIONMEANKERNEL_H__ ++#define __ARM_COMPUTE_CLREDUCTIONMEANKERNEL_H__ ++ ++#include "arm_compute/core/CL/ICLKernel.h" ++#include "arm_compute/core/Types.h" ++ ++namespace arm_compute ++{ ++class ICLTensor; ++ ++/** Interface for the reduction operation kernel */ ++class CLReductionMeanKernel : public ICLKernel ++{ ++public: ++ /** Default constructor */ ++ CLReductionMeanKernel(); ++ /** Prevent instances of this class from being copied (As this class contains pointers) */ ++ CLReductionMeanKernel(const CLReductionMeanKernel &) = delete; ++ /** Prevent instances of this class from being copied (As this class contains pointers) */ ++ CLReductionMeanKernel &operator=(const CLReductionMeanKernel &) = delete; ++ /** Allow instances of this class to be moved */ ++ CLReductionMeanKernel(CLReductionMeanKernel &&) = default; ++ /** Allow instances of this class to be moved */ ++ CLReductionMeanKernel &operator=(CLReductionMeanKernel &&) = default; ++ /** Default destructor */ ++ ~CLReductionMeanKernel() = default; ++ ++ /** Set the input and output tensors. ++ * ++ * @param[in] input Source tensor. Data types supported: F32. Data layouts supported: NCHW. ++ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input. ++ * Output will have the same number of dimensions as input. ++ * @param[in] axis Axis along which to reduce. Supported reduction axis : 0, 1 ++ */ ++ void configure(const ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis); ++ ++ /** Static function to check if given info will lead to a valid configuration of @ref CLReductionMeanKernel. ++ * ++ * @param[in] input Source tensor info. Data types supported: F32. Data layouts supported: NCHW. ++ * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p input. ++ * Output will have the same number of dimensions as input. ++ * @param[in] axis Axis along which to reduce. 
Supported reduction axis : 0, 1 ++ * ++ * @return a status ++ */ ++ static Status validate(const ITensorInfo *input, const ITensorInfo *output, std::vector<uint32_t> axis); ++ ++ // Inherited methods overridden: ++ void run(const Window &window, cl::CommandQueue &queue) override; ++ BorderSize border_size() const override; ++ ++private: ++ const ICLTensor *_input; ++ ICLTensor *_output; ++ std::vector<uint32_t> _reduction_axis; ++ BorderSize _border_size; ++}; ++} // namespace arm_compute ++#endif /*__ARM_COMPUTE_CLREDUCTIONMEANKERNEL_H__ */ +diff --git a/arm_compute/core/CL/kernels/CLStridedSliceKernel.h b/arm_compute/core/CL/kernels/CLStridedSliceKernel.h +new file mode 100644 +index 0000000..456c27d +--- /dev/null ++++ b/arm_compute/core/CL/kernels/CLStridedSliceKernel.h +@@ -0,0 +1,106 @@ ++/* ++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved ++ * Copyright (c) 2017 ARM Limited. ++ * ++ * SPDX-License-Identifier: MIT ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in all ++ * copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++#ifndef __ARM_COMPUTE_CLSTRIDEDSLICEKERNEL_H__ ++#define __ARM_COMPUTE_CLSTRIDEDSLICEKERNEL_H__ ++ ++#include "arm_compute/core/CL/ICLKernel.h" ++#include "arm_compute/core/Types.h" ++ ++namespace arm_compute ++{ ++class ICLTensor; ++ ++/** Interface for the kernel to extract a strided slice of a tensor */ ++class CLStridedSliceKernel : public ICLKernel ++{ ++public: ++ /** Default constructor */ ++ CLStridedSliceKernel(); ++ /** Prevent instances of this class from being copied (As this class contains pointers) */ ++ CLStridedSliceKernel(const CLStridedSliceKernel &) = delete; ++ /** Prevent instances of this class from being copied (As this class contains pointers) */ ++ CLStridedSliceKernel &operator=(const CLStridedSliceKernel &) = delete; ++ /** Allow instances of this class to be moved */ ++ CLStridedSliceKernel(CLStridedSliceKernel &&) = default; ++ /** Allow instances of this class to be moved */ ++ CLStridedSliceKernel &operator=(CLStridedSliceKernel &&) = default; ++ /** Default destructor */ ++ ~CLStridedSliceKernel() = default; ++ /** Set the input and output of the kernel ++ * ++ * @param[in] input Source tensor. Data type supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32 ++ * @param[out] output Destination tensor. Data type supported: Same as @p input ++ * @param[in] beginData The begin tensor. Data types supported: S32. ++ * The number of dimensions must be 1. ++ * The length must be the same as the number of dimensions of input. ++ * @param[in] endData The end tensor. Data types supported: S32. ++ * The number of dimensions must be 1. ++ * The length must be the same as the number of dimensions of input. ++ * @param[in] stridesData The stride tensor.
Data types supported: S32. ++ * The number of dimensions must be 1. ++ * The length must be the same as the number of dimensions of input. ++ * @param[in] beginMask Mask for begin ++ * @param[in] endMask Mask for end ++ * @param[in] shrinkAxisMask Mask for shrink axis. ++ * ++ */ ++ void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData, ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask, int32_t shrinkAxisMask); ++ ++ /** Static function to check if given info will lead to a valid configuration of @ref CLStridedSliceKernel ++ * ++ * @param[in] input The input tensor info. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32 ++ * @param[in] output The output tensor info, Data types supported: same as @p input. ++ * @param[in] begin The begin tensor info. Data types supported: S32. ++ * The number of dimensions must be 1. ++ * The length must be the same as the number of dimensions of input. ++ * @param[in] end The end tensor info. Data types supported: S32. ++ * The number of dimensions must be 1. ++ * The length must be the same as the number of dimensions of input. ++ * @param[in] stride The stride tensor info. Data types supported: S32. ++ * The number of dimensions must be 1. ++ * The length must be the same as the number of dimensions of input. ++ * @param[in] beginMask Mask for begin ++ * @param[in] endMask Mask for end ++ * @param[in] shrinkAxisMask Mask for shrink axis.
++ * ++ * @return a status ++ */ ++ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *begin, const ITensorInfo *end, const ITensorInfo *stride, int32_t beginMask, int32_t endMask, int32_t shrinkAxisMask); ++ ++ // Inherited methods overridden: ++ void run(const Window &window, cl::CommandQueue &queue) override; ++ ++private: ++ const ICLTensor *_input; /** Source tensor */ ++ ICLTensor *_output; /** Destination tensor */ ++ ICLTensor *_beginData; /** Start indices of input tensor */ ++ ICLTensor *_endData; /** Stop indices of input tensor */ ++ ICLTensor *_stridesData; /** Strides tensor */ ++ int32_t _beginMask; /** Begin mask */ ++ int32_t _endMask; /** End mask */ ++ int32_t _shrinkAxisMask; /** Shrink axis mask */ ++}; ++} // namespace arm_compute ++#endif /*__ARM_COMPUTE_CLSTRIDEDSLICEKERNEL_H__ */ +diff --git a/arm_compute/core/CL/kernels/CLTopKV2Kernel.h b/arm_compute/core/CL/kernels/CLTopKV2Kernel.h +new file mode 100644 +index 0000000..09bcfe5 +--- /dev/null ++++ b/arm_compute/core/CL/kernels/CLTopKV2Kernel.h +@@ -0,0 +1,309 @@ ++/* ++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved ++ * Copyright (c) 2017 ARM Limited. ++ * ++ * SPDX-License-Identifier: MIT ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in all ++ * copies or substantial portions of the Software. 
++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++#ifndef __ARM_COMPUTE_CLTOPKV2KERNEL_H__ ++#define __ARM_COMPUTE_CLTOPKV2KERNEL_H__ ++ ++#include "arm_compute/core/CL/ICLArray.h" ++#include "arm_compute/core/CL/ICLKernel.h" ++ ++#include <array> ++ ++// these parameters can be changed ++#define _ITEMS 16 // number of items in a group ++#define _GROUPS 4 // the number of virtual processors is _ITEMS * _GROUPS ++#define _HISTOSPLIT (_ITEMS*_GROUPS/2) // number of splits of the histogram ++#define PERMUT // store the final permutation ++//////////////////////////////////////////////////////// ++ ++namespace arm_compute ++{ ++class ICLTensor; ++ ++class CLTopKV2Single : public ICLKernel ++{ ++public: ++ /** Constructor */ ++ CLTopKV2Single(); ++ /** Prevent instances of this class from being copied (As this class contains pointers) */ ++ CLTopKV2Single(const CLTopKV2Single &) = delete; ++ /** Prevent instances of this class from being copied (As this class contains pointers) */ ++ CLTopKV2Single &operator=(const CLTopKV2Single &) = delete; ++ /** Allow instances of this class to be moved */ ++ CLTopKV2Single(CLTopKV2Single &&) = default; ++ /** Allow instances of this class to be moved */ ++ CLTopKV2Single &operator=(CLTopKV2Single &&) = default; ++ ++ void configure(ICLTensor *input, ICLTensor *topk_values, ++ ICLTensor *topk_indices, cl::Buffer *indices, ++ cl::Buffer *temp_stack, int k, int n); ++ ++ // Inherited methods overridden: ++ void run(const Window &window, cl::CommandQueue &queue) override; ++ ++private: 
++ ICLTensor *_input; ++ ICLTensor *_topk_values; ++ ICLTensor *_topk_indices; ++}; ++ ++class CLTopKV2Init : public ICLKernel ++{ ++public: ++ /** Constructor */ ++ CLTopKV2Init(); ++ /** Prevent instances of this class from being copied (As this class contains pointers) */ ++ CLTopKV2Init(const CLTopKV2Init &) = delete; ++ /** Prevent instances of this class from being copied (As this class contains pointers) */ ++ CLTopKV2Init &operator=(const CLTopKV2Init &) = delete; ++ /** Allow instances of this class to be moved */ ++ CLTopKV2Init(CLTopKV2Init &&) = default; ++ /** Allow instances of this class to be moved */ ++ CLTopKV2Init &operator=(CLTopKV2Init &&) = default; ++ ++ void configure(ICLTensor *input, cl::Buffer* in_key_buf, ++ cl::Buffer* in_ind_buf, int n); ++ ++ // Inherited methods overridden: ++ void run(const Window &window, cl::CommandQueue &queue) override; ++ ++private: ++ ICLTensor *_input; ++}; ++ ++class CLRadixSortHistogram : public ICLKernel ++{ ++public: ++ /** Constructor */ ++ CLRadixSortHistogram(); ++ /** Prevent instances of this class from being copied (As this class contains pointers) */ ++ CLRadixSortHistogram(const CLRadixSortHistogram &) = delete; ++ /** Prevent instances of this class from being copied (As this class contains pointers) */ ++ CLRadixSortHistogram &operator=(const CLRadixSortHistogram &) = delete; ++ /** Allow instances of this class to be moved */ ++ CLRadixSortHistogram(CLRadixSortHistogram &&) = default; ++ /** Allow instances of this class to be moved */ ++ CLRadixSortHistogram &operator=(CLRadixSortHistogram &&) = default; ++ ++ void configure(cl::Buffer* hist_buf, int bits, int n); ++ ++ void setPass(int pass, cl::Buffer *in_key_buf) { ++ _pass = pass; ++ _in_key_buf = in_key_buf; ++ } ++ ++ // Inherited methods overridden: ++ void run(const Window &window, cl::CommandQueue &queue) override; ++ ++private: ++ int _pass; ++ cl::Buffer *_in_key_buf; ++}; ++ ++class CLRadixSortScanHistogram : public ICLKernel ++{ 
++public: ++ /** Constructor */ ++ CLRadixSortScanHistogram(); ++ /** Prevent instances of this class from being copied (As this class contains pointers) */ ++ CLRadixSortScanHistogram(const CLRadixSortScanHistogram &) = delete; ++ /** Prevent instances of this class from being copied (As this class contains pointers) */ ++ CLRadixSortScanHistogram &operator=(const CLRadixSortScanHistogram &) = delete; ++ /** Allow instances of this class to be moved */ ++ CLRadixSortScanHistogram(CLRadixSortScanHistogram &&) = default; ++ /** Allow instances of this class to be moved */ ++ CLRadixSortScanHistogram &operator=(CLRadixSortScanHistogram &&) = default; ++ ++ void configure(cl::Buffer* hist_buf, cl::Buffer* glob_sum_buf, int bits); ++ ++ // Inherited methods overridden: ++ void run(const Window &window, cl::CommandQueue &queue) override; ++}; ++ ++class CLRadixSortGlobalScanHistogram : public ICLKernel ++{ ++public: ++ /** Constructor */ ++ CLRadixSortGlobalScanHistogram(); ++ /** Prevent instances of this class from being copied (As this class contains pointers) */ ++ CLRadixSortGlobalScanHistogram(const CLRadixSortGlobalScanHistogram &) = delete; ++ /** Prevent instances of this class from being copied (As this class contains pointers) */ ++ CLRadixSortGlobalScanHistogram &operator=(const CLRadixSortGlobalScanHistogram &) = delete; ++ /** Allow instances of this class to be moved */ ++ CLRadixSortGlobalScanHistogram(CLRadixSortGlobalScanHistogram &&) = default; ++ /** Allow instances of this class to be moved */ ++ CLRadixSortGlobalScanHistogram &operator=(CLRadixSortGlobalScanHistogram &&) = default; ++ ++ void configure(cl::Buffer* glob_sum_buf, cl::Buffer* temp_buf, int bits); ++ ++ // Inherited methods overridden: ++ void run(const Window &window, cl::CommandQueue &queue) override; ++}; ++ ++class CLRadixSortPasteHistogram : public ICLKernel ++{ ++public: ++ /** Constructor */ ++ CLRadixSortPasteHistogram(); ++ /** Prevent instances of this class from being copied 
(As this class contains pointers) */ ++ CLRadixSortPasteHistogram(const CLRadixSortPasteHistogram &) = delete; ++ /** Prevent instances of this class from being copied (As this class contains pointers) */ ++ CLRadixSortPasteHistogram &operator=(const CLRadixSortPasteHistogram &) = delete; ++ /** Allow instances of this class to be moved */ ++ CLRadixSortPasteHistogram(CLRadixSortPasteHistogram &&) = default; ++ /** Allow instances of this class to be moved */ ++ CLRadixSortPasteHistogram &operator=(CLRadixSortPasteHistogram &&) = default; ++ ++ void configure(cl::Buffer* hist_buf, cl::Buffer* glob_sum_buf, int bits); ++ ++ // Inherited methods overridden: ++ void run(const Window &window, cl::CommandQueue &queue) override; ++}; ++ ++class CLRadixSortReorder : public ICLKernel ++{ ++public: ++ /** Constructor */ ++ CLRadixSortReorder(); ++ /** Prevent instances of this class from being copied (As this class contains pointers) */ ++ CLRadixSortReorder(const CLRadixSortReorder &) = delete; ++ /** Prevent instances of this class from being copied (As this class contains pointers) */ ++ CLRadixSortReorder &operator=(const CLRadixSortReorder &) = delete; ++ /** Allow instances of this class to be moved */ ++ CLRadixSortReorder(CLRadixSortReorder &&) = default; ++ /** Allow instances of this class to be moved */ ++ CLRadixSortReorder &operator=(CLRadixSortReorder &&) = default; ++ ++ void configure( cl::Buffer *hist_buf, int bits, int n); ++ ++ void setPass(int pass, cl::Buffer *in_key_buf, cl::Buffer *out_key_buf, ++ cl::Buffer *in_ind_buf, cl::Buffer *out_ind_buf) { ++ _pass = pass; ++ _in_key_buf = in_key_buf; ++ _out_key_buf = out_key_buf; ++ _in_ind_buf = in_ind_buf; ++ _out_ind_buf = out_ind_buf; ++ } ++ // Inherited methods overridden: ++ void run(const Window &window, cl::CommandQueue &queue) override; ++ ++private: ++ int _pass; ++ cl::Buffer *_in_key_buf; ++ cl::Buffer *_out_key_buf; ++ cl::Buffer *_in_ind_buf; ++ cl::Buffer *_out_ind_buf; ++}; ++ ++class 
CLTopKV2FindFirstNegative : public ICLKernel ++{ ++public: ++ /** Constructor */ ++ CLTopKV2FindFirstNegative(); ++ /** Prevent instances of this class from being copied (As this class contains pointers) */ ++ CLTopKV2FindFirstNegative(const CLTopKV2FindFirstNegative &) = delete; ++ /** Prevent instances of this class from being copied (As this class contains pointers) */ ++ CLTopKV2FindFirstNegative &operator=(const CLTopKV2FindFirstNegative &) = delete; ++ /** Allow instances of this class to be moved */ ++ CLTopKV2FindFirstNegative(CLTopKV2FindFirstNegative &&) = default; ++ /** Allow instances of this class to be moved */ ++ CLTopKV2FindFirstNegative &operator=(CLTopKV2FindFirstNegative &&) = default; ++ ++ void configure(cl::Buffer *first_negative_idx_buf, int n); ++ ++ void setOutputBuffer(cl::Buffer* out_key_buf) { ++ _out_key_buf = out_key_buf; ++ } ++ ++ // Inherited methods overridden: ++ void run(const Window &window, cl::CommandQueue &queue) override; ++ ++private: ++ cl::Buffer *_out_key_buf; ++}; ++ ++class CLTopKV2ReorderNegatives : public ICLKernel ++{ ++public: ++ /** Constructor */ ++ CLTopKV2ReorderNegatives(); ++ /** Prevent instances of this class from being copied (As this class contains pointers) */ ++ CLTopKV2ReorderNegatives(const CLTopKV2ReorderNegatives &) = delete; ++ /** Prevent instances of this class from being copied (As this class contains pointers) */ ++ CLTopKV2ReorderNegatives &operator=(const CLTopKV2ReorderNegatives &) = delete; ++ /** Allow instances of this class to be moved */ ++ CLTopKV2ReorderNegatives(CLTopKV2ReorderNegatives &&) = default; ++ /** Allow instances of this class to be moved */ ++ CLTopKV2ReorderNegatives &operator=(CLTopKV2ReorderNegatives &&) = default; ++ ++ void configure(cl::Buffer *first_negative_idx_buf, int n); ++ ++ void setBuffers(cl::Buffer *in_key_buf, cl::Buffer* out_key_buf, ++ cl::Buffer *in_ind_buf, cl::Buffer *out_ind_buf) { ++ _in_key_buf = in_key_buf; ++ _out_key_buf = out_key_buf; ++ 
_in_ind_buf = in_ind_buf; ++ _out_ind_buf = out_ind_buf; ++ } ++ ++ // Inherited methods overridden: ++ void run(const Window &window, cl::CommandQueue &queue) override; ++ ++private: ++ cl::Buffer *_in_key_buf; ++ cl::Buffer *_out_key_buf; ++ cl::Buffer *_in_ind_buf; ++ cl::Buffer *_out_ind_buf; ++}; ++ ++class CLTopKV2Store : public ICLKernel ++{ ++public: ++ /** Constructor */ ++ CLTopKV2Store(); ++ /** Prevent instances of this class from being copied (As this class contains pointers) */ ++ CLTopKV2Store(const CLTopKV2Store &) = delete; ++ /** Prevent instances of this class from being copied (As this class contains pointers) */ ++ CLTopKV2Store &operator=(const CLTopKV2Store &) = delete; ++ /** Allow instances of this class to be moved */ ++ CLTopKV2Store(CLTopKV2Store &&) = default; ++ /** Allow instances of this class to be moved */ ++ CLTopKV2Store &operator=(CLTopKV2Store &&) = default; ++ ++ void configure(ICLTensor *values, ICLTensor *indices, int k, int n); ++ ++ void setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf); ++ ++ // Inherited methods overridden: ++ void run(const Window &window, cl::CommandQueue &queue) override; ++private: ++ ICLTensor *_values; ++ ICLTensor *_indices; ++ cl::Buffer *_out_key_buf; ++ cl::Buffer *_out_ind_buf; ++}; ++ ++} // namespace arm_compute ++ ++#endif // __ARM_COMPUTE_CLTOPKV2KERNEL_H__ +diff --git a/arm_compute/core/Helpers.inl b/arm_compute/core/Helpers.inl +index b359811..b588d08 100644 +--- a/arm_compute/core/Helpers.inl ++++ b/arm_compute/core/Helpers.inl +@@ -300,6 +300,39 @@ inline bool set_quantization_info_if_empty(ITensorInfo &info, QuantizationInfo q + return false; + } + ++inline ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, const TensorShape &dst_shape, InterpolationPolicy policy, BorderSize border_size, bool border_undefined) ++{ ++ const auto wr = static_cast<float>(dst_shape[0]) / static_cast<float>(src_info.tensor_shape()[0]); ++ const auto hr = 
static_cast<float>(dst_shape[1]) / static_cast<float>(src_info.tensor_shape()[1]); ++ ++ ValidRegion valid_region{ Coordinates(), dst_shape, src_info.tensor_shape().num_dimensions() }; ++ ++ Coordinates &anchor = valid_region.anchor; ++ TensorShape &shape = valid_region.shape; ++ ++ anchor.set(0, (policy == InterpolationPolicy::BILINEAR ++ && border_undefined) ? ++ ((static_cast<int>(src_info.valid_region().anchor[0] + border_size.left + 0.5f)) * wr - 0.5f) : ++ ((static_cast<int>(src_info.valid_region().anchor[0] + 0.5f)) * wr - 0.5f)); ++ anchor.set(1, (policy == InterpolationPolicy::BILINEAR ++ && border_undefined) ? ++ ((static_cast<int>(src_info.valid_region().anchor[1] + border_size.top + 0.5f)) * hr - 0.5f) : ++ ((static_cast<int>(src_info.valid_region().anchor[1] + 0.5f)) * hr - 0.5f)); ++ float shape_out_x = (policy == InterpolationPolicy::BILINEAR ++ && border_undefined) ? ++ ((static_cast<int>(src_info.valid_region().anchor[0]) + static_cast<int>(src_info.valid_region().shape[0]) - 1) - 1 + 0.5f) * wr - 0.5f : ++ ((static_cast<int>(src_info.valid_region().anchor[0]) + static_cast<int>(src_info.valid_region().shape[0])) + 0.5f) * wr - 0.5f; ++ float shape_out_y = (policy == InterpolationPolicy::BILINEAR ++ && border_undefined) ? 
++ ((static_cast<int>(src_info.valid_region().anchor[1]) + static_cast<int>(src_info.valid_region().shape[1]) - 1) - 1 + 0.5f) * hr - 0.5f : ++ ((static_cast<int>(src_info.valid_region().anchor[1]) + static_cast<int>(src_info.valid_region().shape[1])) + 0.5f) * hr - 0.5f; ++ ++ shape.set(0, shape_out_x - anchor[0]); ++ shape.set(1, shape_out_y - anchor[1]); ++ ++ return valid_region; ++} ++ + inline Coordinates index2coords(const TensorShape &shape, int index) + { + int num_elements = shape.total_size(); +diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h +index fe90b09..8396b9f 100644 +--- a/arm_compute/runtime/CL/CLFunctions.h ++++ b/arm_compute/runtime/CL/CLFunctions.h +@@ -1,4 +1,5 @@ + /* ++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT +@@ -37,6 +38,7 @@ + #include "arm_compute/runtime/CL/functions/CLBitwiseXor.h" + #include "arm_compute/runtime/CL/functions/CLBox3x3.h" + #include "arm_compute/runtime/CL/functions/CLCannyEdge.h" ++#include "arm_compute/runtime/CL/functions/CLCast.h" + #include "arm_compute/runtime/CL/functions/CLChannelCombine.h" + #include "arm_compute/runtime/CL/functions/CLChannelExtract.h" + #include "arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h" +@@ -62,6 +64,7 @@ + #include "arm_compute/runtime/CL/functions/CLFlattenLayer.h" + #include "arm_compute/runtime/CL/functions/CLFloor.h" + #include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h" ++#include "arm_compute/runtime/CL/functions/CLGather.h" + #include "arm_compute/runtime/CL/functions/CLGEMM.h" + #include "arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h" + #include "arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h" +@@ -94,11 +97,14 @@ + #include "arm_compute/runtime/CL/functions/CLPermute.h" + #include "arm_compute/runtime/CL/functions/CLPhase.h" + #include 
"arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h" ++#include "arm_compute/runtime/CL/functions/CLPixelWiseDivision.h" + #include "arm_compute/runtime/CL/functions/CLPoolingLayer.h" + #include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h" + #include "arm_compute/runtime/CL/functions/CLRNNLayer.h" + #include "arm_compute/runtime/CL/functions/CLROIPoolingLayer.h" ++#include "arm_compute/runtime/CL/functions/CLReduceMax.h" + #include "arm_compute/runtime/CL/functions/CLReductionOperation.h" ++#include "arm_compute/runtime/CL/functions/CLReductionMean.h" + #include "arm_compute/runtime/CL/functions/CLRemap.h" + #include "arm_compute/runtime/CL/functions/CLReshapeLayer.h" + #include "arm_compute/runtime/CL/functions/CLScale.h" +@@ -107,6 +113,7 @@ + #include "arm_compute/runtime/CL/functions/CLSobel5x5.h" + #include "arm_compute/runtime/CL/functions/CLSobel7x7.h" + #include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h" ++#include "arm_compute/runtime/CL/functions/CLStridedSlice.h" + #include "arm_compute/runtime/CL/functions/CLTableLookup.h" + #include "arm_compute/runtime/CL/functions/CLThreshold.h" + #include "arm_compute/runtime/CL/functions/CLTranspose.h" +@@ -115,5 +122,6 @@ + #include "arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h" + #include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h" + #include "arm_compute/runtime/CL/functions/CLWinogradInputTransform.h" ++#include "arm_compute/runtime/CL/functions/CLTopKV2.h" + + #endif /* __ARM_COMPUTE_CLFUNCTIONS_H__ */ +diff --git a/arm_compute/runtime/CL/functions/CLArithmeticAddition.h b/arm_compute/runtime/CL/functions/CLArithmeticAddition.h +index 5b2fc8c..86dc2ef 100644 +--- a/arm_compute/runtime/CL/functions/CLArithmeticAddition.h ++++ b/arm_compute/runtime/CL/functions/CLArithmeticAddition.h +@@ -41,19 +41,19 @@ class CLArithmeticAddition : public ICLSimpleFunction + public: + /** Initialise the kernel's inputs, output and convertion policy. 
+ * +- * @param[in, out] input1 First tensor input. Data types supported: U8/QS8/QS16/S16/F16/F32. ++ * @param[in, out] input1 First tensor input. Data types supported: U8/QS8/QASYMM8/QS16/S16/F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. +- * @param[in, out] input2 Second tensor input. Data types supported: U8, QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16/F32. ++ * @param[in, out] input2 Second tensor input. Data types supported: U8, QS8 (only if @p input1 is QS8), QASYMM8 (only if @p input1 is QASYMM8), QS16 (only if @p input1 is QS16), S16/F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. +- * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16/F32. ++ * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QASYMM8 (only if both inputs are QASYMM8), QS16 (only if both inputs are QS16), S16/F16/F32. + * @param[in] policy Policy to use to handle overflow. + */ + void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy); + /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticAddition + * +- * @param[in] input1 First tensor input info. Data types supported: U8/QS8/QS16/S16/F16/F32. +- * @param[in] input2 Second tensor input info. Data types supported: U8/QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16/F32. +- * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16/F32. ++ * @param[in] input1 First tensor input info. 
Data types supported: U8/QS8/QASYMM8/QS16/S16/F16/F32. ++ * @param[in] input2 Second tensor input info. Data types supported: U8/QS8 (only if @p input1 is QS8), QASYMM8 (only if @p input1 is QASYMM8), QS16 (only if @p input1 is QS16), S16/F16/F32. ++ * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QASYMM8 (only if both inputs are QASYMM8), QS16 (only if both inputs are QS16), S16/F16/F32. + * @param[in] policy Policy to use to handle overflow. + * + * @return a status +diff --git a/arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h b/arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h +index 0d3f5bc..6d76c70 100644 +--- a/arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h ++++ b/arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h +@@ -1,4 +1,5 @@ + /* ++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT +@@ -42,12 +43,14 @@ class CLArithmeticSubtraction : public ICLSimpleFunction + public: + /** Initialise the kernel's inputs, output and convertion policy. + * +- * @param[in] input1 First tensor input. Data types supported: U8/QS8/QS16/S16/F16/F32. +- * @param[in] input2 Second tensor input. Data types supported: U8/QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16/F32. +- * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16/F32. +- * @param[in] policy Policy to use to handle overflow. ++ * @param[in, out] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32. ++ * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. ++ * @param[in, out] input2 An input tensor. Data types supported: same as @p input1. 
++ * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. ++ * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16/F32. ++ * @param[in] policy Policy to use to handle overflow. + */ +- void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy); ++ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy); + /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticSubtraction + * + * @param[in] input1 First tensor input info. Data types supported: U8/QS8/QS16/S16/F16/F32. +diff --git a/arm_compute/runtime/CL/functions/CLCast.h b/arm_compute/runtime/CL/functions/CLCast.h +new file mode 100644 +index 0000000..49fd342 +--- /dev/null ++++ b/arm_compute/runtime/CL/functions/CLCast.h +@@ -0,0 +1,52 @@ ++/* ++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved ++ * Copyright (c) 2016-2018 ARM Limited. ++ * ++ * SPDX-License-Identifier: MIT ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in all ++ * copies or substantial portions of the Software. 
++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++#ifndef __ARM_COMPUTE_CLCAST_H__ ++#define __ARM_COMPUTE_CLCAST_H__ ++ ++#include "arm_compute/core/Types.h" ++#include "arm_compute/runtime/CL/ICLSimpleFunction.h" ++ ++namespace arm_compute ++{ ++class ICLTensor; ++ ++/** Basic function to run @ref CLCastKernel ++ * ++ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. ++ * @note The function converts the input tensor to the tensor of the output tensor's type. ++ */ ++class CLCast : public ICLSimpleFunction ++{ ++public: ++ /** Initialise the kernel's input and output. ++ * ++ * @param[in, out] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. ++ * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel. ++ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. ++ */ ++ void configure(ICLTensor *input, ICLTensor *output); ++}; ++} ++#endif /* __ARM_COMPUTE_CLCAST_H__ */ +diff --git a/arm_compute/runtime/CL/functions/CLGather.h b/arm_compute/runtime/CL/functions/CLGather.h +new file mode 100644 +index 0000000..1aae32e +--- /dev/null ++++ b/arm_compute/runtime/CL/functions/CLGather.h +@@ -0,0 +1,56 @@ ++/* ++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved ++ * Copyright (c) 2016-2018 ARM Limited. 
++ * ++ * SPDX-License-Identifier: MIT ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in all ++ * copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++#ifndef __ARM_COMPUTE_CLGATHER_H__ ++#define __ARM_COMPUTE_CLGATHER_H__ ++ ++#include "arm_compute/core/Types.h" ++#include "arm_compute/runtime/CL/ICLSimpleFunction.h" ++ ++namespace arm_compute ++{ ++class ICLTensor; ++ ++/** Basic function to run @ref CLGatherKernel. */ ++class CLGather : public ICLSimpleFunction ++{ ++public: ++ /** Initialise the kernel's inputs and output. ++ * ++ * @param[in] input1 An input tensor. Data types supported: U8/S32/F32. ++ * @param[in] input2 An indexes tensor. Data types supported: S32. ++ * @param[out] output The output tensor, Data types supported: same as @p input1.
++ */ ++ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output); ++ /** Static function to check if given info will lead to a valid configuration of @ref CLGather ++ * ++ * @param[in] input1 An input tensor. Data types supported: U8/S32/F32. ++ * @param[in] input2 An indexes tensor. Data types supported: S32. ++ * @param[out] output The output tensor, Data types supported: same as @p input1. ++ * @return a status ++ */ ++ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output); ++}; ++} ++#endif /*__ARM_COMPUTE_CLGATHER_H__ */ +diff --git a/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h b/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h +new file mode 100644 +index 0000000..5008159 +--- /dev/null ++++ b/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h +@@ -0,0 +1,71 @@ ++/* ++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved ++ * Copyright (c) 2016-2018 ARM Limited. ++ * ++ * SPDX-License-Identifier: MIT ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in all ++ * copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++#ifndef __ARM_COMPUTE_CLPIXELWISEDIVISION_H__ ++#define __ARM_COMPUTE_CLPIXELWISEDIVISION_H__ ++ ++#include "arm_compute/core/Types.h" ++#include "arm_compute/runtime/CL/ICLSimpleFunction.h" ++ ++namespace arm_compute ++{ ++class ICLTensor; ++ ++/** Basic function to run @ref CLPixelWiseDivisionKernel. */ ++class CLPixelWiseDivision : public ICLSimpleFunction ++{ ++public: ++ /** Initialise the kernel's inputs, output and conversion policy. ++ * ++ * @param[in, out] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32. ++ * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. ++ * @param[in, out] input2 An input tensor. Data types supported: same as @p input1. ++ * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. ++ * @param[out] output The output tensor, Data types supported: same as @p input1. Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). ++ * @param[in] scale Scale to apply after multiplication. ++ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1. ++ * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate ++ * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
++ */ ++ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale = 1.f, ++ ConvertPolicy overflow_policy = ConvertPolicy::WRAP, ++ RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO); ++ /** Static function to check if given info will lead to a valid configuration of @ref CLPixelWiseDivision ++ * ++ * @param[in] input1 An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32. ++ * @param[in] input2 An input tensor info. Data types supported: same as @p input1. ++ * @param[in] output The output tensor info, Data types supported: same as @p input1. Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). ++ * @param[in] scale Scale to apply after multiplication. ++ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1. ++ * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate ++ * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. ++ * ++ * @return a status ++ */ ++ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ++ float scale = 1.f, ConvertPolicy overflow_policy = ConvertPolicy::WRAP, ++ RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO); ++}; ++} ++#endif /*__ARM_COMPUTE_CLPIXELWISEDIVISION_H__ */ +diff --git a/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h b/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h +index 75b67cd..3f2ffcd 100644 +--- a/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h ++++ b/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h +@@ -37,11 +37,11 @@ class CLPixelWiseMultiplication : public ICLSimpleFunction + public: + /** Initialise the kernel's inputs, output and convertion policy. + * +- * @param[in, out] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32. ++ * @param[in, out] input1 An input tensor. 
Data types supported: U8/QS8/QASYMM8/QS16/S16/F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] input2 An input tensor. Data types supported: same as @p input1. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. +- * @param[out] output The output tensor, Data types supported: same as @p input1. Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). ++ * @param[out] output The output tensor, Data types supported: same as @p input1. Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). QASYMM8 requires both inputs are QASYMM8. + * @param[in] scale Scale to apply after multiplication. + * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1. + * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate +@@ -51,9 +51,9 @@ public: + ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); + /** Static function to check if given info will lead to a valid configuration of @ref CLPixelWiseMultiplication + * +- * @param[in] input1 An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32. ++ * @param[in] input1 An input tensor info. Data types supported: U8/QS8/QASYMM8/QS16/S16/F16/F32. + * @param[in] input2 An input tensor info. Data types supported: same as @p input1. +- * @param[in] output The output tensor info, Data types supported: same as @p input1. Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). ++ * @param[in] output The output tensor info, Data types supported: same as @p input1. Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). QASYMM8 requires both inputs are QASYMM8. + * @param[in] scale Scale to apply after multiplication. 
+ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1. + * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate +diff --git a/arm_compute/runtime/CL/functions/CLReduceMax.h b/arm_compute/runtime/CL/functions/CLReduceMax.h +new file mode 100644 +index 0000000..9cce054 +--- /dev/null ++++ b/arm_compute/runtime/CL/functions/CLReduceMax.h +@@ -0,0 +1,89 @@ ++/* ++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved ++ * Copyright (c) 2017 ARM Limited. ++ * ++ * SPDX-License-Identifier: MIT ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in all ++ * copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. 
++ */ ++#ifndef __ARM_COMPUTE_CLREDUCE_MAX_H__ ++#define __ARM_COMPUTE_CLREDUCE_MAX_H__ ++ ++#include "arm_compute/runtime/CL/CLArray.h" ++#include "arm_compute/runtime/IFunction.h" ++#include "arm_compute/core/Types.h" ++#include "arm_compute/core/CL/ICLKernel.h" ++ ++namespace arm_compute ++{ ++class ICLTensor; ++ ++/** Basic function to execute ReduceMax operation. This function calls the following OpenCL kernels: ++ * ++ * -# @ref CLTopKV2Kernel ++ */ ++class CLReduceMax : public IFunction ++{ ++public: ++ /** Constructor */ ++ CLReduceMax(); ++ /** Prevent instances of this class from being copied (As this class contains pointers) */ ++ CLReduceMax(const CLReduceMax &) = delete; ++ /** Prevent instances of this class from being copied (As this class contains pointers) */ ++ CLReduceMax &operator=(const CLReduceMax &) = delete; ++ /** Allow instances of this class to be moved */ ++ CLReduceMax(CLReduceMax &&) = default; ++ /** Allow instances of this class to be moved */ ++ CLReduceMax &operator=(CLReduceMax &&) = default; ++ /** Initialise the kernel's inputs and outputs. ++ * ++ * @note When locations of min and max occurrences are requested, the reported number of locations is limited to the given array size. ++ * ++ * @param[in] input Input image. Data types supported: F32 ++ * @param[in] axis Axis to reduce. Data type supported: S32 ++ * @param[out] output indices related to top k values. Data types supported: F32. ++ */ ++ void configure(ICLTensor *input, int32_t axis, ICLTensor *output); ++ /** Static function to check if given info will lead to a valid configuration of @ref CLReduceMax ++ * ++ * @param[in] input Input image. Data types supported: F32 ++ * @param[in] axis Axis to reduce. Data type supported: S32 ++ * @param[out] output indices related to top k values. Data types supported: F32.
* ++ * ++ * @return a status ++ */ ++ static Status validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output); ++ ++ // Inherited methods overridden: ++ void run() override; ++ ++private: ++ ++ void run_on_cpu(); ++ ++ int32_t _axis; ++ ++ ICLTensor *_input; ++ ICLTensor *_output; ++ ++ std::unique_ptr<ICLKernel> _kernel; ++ ++}; ++} ++#endif /*__ARM_COMPUTE_CLREDUCE_MAX_H__ */ +diff --git a/arm_compute/runtime/CL/functions/CLReductionMean.h b/arm_compute/runtime/CL/functions/CLReductionMean.h +new file mode 100644 +index 0000000..1f2a8b5 +--- /dev/null ++++ b/arm_compute/runtime/CL/functions/CLReductionMean.h +@@ -0,0 +1,76 @@ ++/* ++ * Copyright (c) 2017-2018 ARM Limited. ++ * ++ * SPDX-License-Identifier: MIT ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in all ++ * copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. 
++ */ ++#ifndef __ARM_COMPUTE_CLREDUCTIONMEAN_H__ ++#define __ARM_COMPUTE_CLREDUCTIONMEAN_H__ ++ ++#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" ++#include "arm_compute/core/CL/kernels/CLReductionMeanKernel.h" ++#include "arm_compute/core/Types.h" ++#include "arm_compute/runtime/CL/CLTensor.h" ++#include "arm_compute/runtime/IFunction.h" ++ ++#include <cstdint> ++#include <memory> ++#include <vector> ++ ++namespace arm_compute ++{ ++class ICLTensor; ++ ++/** Perform reduction operation. ++ */ ++class CLReductionMean : public IFunction ++{ ++public: ++ /** Default Constructor. ++ */ ++ CLReductionMean(); ++ ++ /** Set the input and output tensors. ++ * ++ * @param[in] input Source tensor. Data types supported: F32. Data layouts supported: NCHW. ++ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input. ++ * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1 ++ */ ++ void configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis); ++ ++ /** Static function to check if given info will lead to a valid configuration of @ref CLReductionMean. ++ * ++ * @param[in] input Source tensor info. Data types supported: F32. Data layouts supported: NCHW. ++ * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p input. ++ * @param[in] axis Axis along which to reduce. 
Supported reduction axis : 0,1 ++ * ++ * @return a status ++ */ ++ static Status validate(const ITensorInfo *input, const ITensorInfo *output, std::vector<uint32_t> axis); ++ ++ // Inherited methods overridden: ++ void run() override; ++ ++private: ++ CLReductionMeanKernel _reduction_mean_kernel; ++ CLFillBorderKernel _fill_border_kernel; ++}; ++} ++#endif /*__ARM_COMPUTE_CLREDUCTIONMEAN_H__ */ +diff --git a/arm_compute/runtime/CL/functions/CLStridedSlice.h b/arm_compute/runtime/CL/functions/CLStridedSlice.h +new file mode 100644 +index 0000000..4f765bd +--- /dev/null ++++ b/arm_compute/runtime/CL/functions/CLStridedSlice.h +@@ -0,0 +1,73 @@ ++/* ++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved ++ * Copyright (c) 2017 ARM Limited. ++ * ++ * SPDX-License-Identifier: MIT ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in all ++ * copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. 
++ */ ++#ifndef __ARM_COMPUTE_CLSTRIDEDSLICE_H__ ++#define __ARM_COMPUTE_CLSTRIDEDSLICE_H__ ++ ++#include "arm_compute/runtime/IFunction.h" ++#include "arm_compute/runtime/CL/ICLSimpleFunction.h" ++ ++namespace arm_compute ++{ ++class ICLTensor; ++ ++/** Basic function to run @ref CLStridedSliceKernel */ ++class CLStridedSlice : public ICLSimpleFunction ++{ ++public: ++ /** Initialise the kernel's inputs and outputs ++ * ++ * @param[in] input First tensor input. Data type supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32 ++ * @param[out] output Output tensor. Data type supported: Same as @p input ++ */ ++ void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData, ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask, int32_t shrinkAxisMask); ++}; ++ ++class CLStridedSliceCPU : public IFunction ++{ ++public: ++ /** Initialise inputs and outputs ++ * ++ * @param[in] input First tensor input. ++ * @param[out] output Output tensor. ++ */ ++ void configure(ICLTensor *input, ICLTensor *output, ICLTensor *beginData, ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask, int32_t shrinkAxisMask); ++ ++ void run() override; ++ ++private: ++ void run_on_cpu(); ++ ++ ICLTensor *_input; ++ ICLTensor *_output; ++ ICLTensor *_beginData; ++ ICLTensor *_endData; ++ ICLTensor *_stridesData; ++ int32_t _beginMask; ++ int32_t _endMask; ++ int32_t _shrinkAxisMask; ++}; ++ ++} ++#endif /*__ARM_COMPUTE_CLSTRIDEDSLICE_H__ */ +diff --git a/arm_compute/runtime/CL/functions/CLTopKV2.h b/arm_compute/runtime/CL/functions/CLTopKV2.h +new file mode 100644 +index 0000000..0dd4287 +--- /dev/null ++++ b/arm_compute/runtime/CL/functions/CLTopKV2.h +@@ -0,0 +1,115 @@ ++/* ++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved ++ * Copyright (c) 2017 ARM Limited. 
++ * ++ * SPDX-License-Identifier: MIT ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in all ++ * copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++#ifndef __ARM_COMPUTE_CLTOPK_V2_H__ ++#define __ARM_COMPUTE_CLTOPK_V2_H__ ++ ++#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h" ++ ++#include "arm_compute/runtime/CL/CLArray.h" ++#include "arm_compute/runtime/IFunction.h" ++ ++namespace arm_compute ++{ ++class ICLTensor; ++ ++/** Basic function to execute TopK operation. 
This function calls the following OpenCL kernels: ++ * ++ * -# @ref CLTopKV2Kernel ++ */ ++class CLTopKV2 : public IFunction ++{ ++public: ++ /** Constructor */ ++ CLTopKV2(); ++ /** Prevent instances of this class from being copied (As this class contains pointers) */ ++ CLTopKV2(const CLTopKV2 &) = delete; ++ /** Prevent instances of this class from being copied (As this class contains pointers) */ ++ CLTopKV2 &operator=(const CLTopKV2 &) = delete; ++ /** Allow instances of this class to be moved */ ++ CLTopKV2(CLTopKV2 &&) = default; ++ /** Allow instances of this class to be moved */ ++ CLTopKV2 &operator=(CLTopKV2 &&) = default; ++ /** Initialise the kernel's inputs and outputs. ++ * ++ * @note When locations of min and max occurrences are requested, the reported number of locations is limited to the given array size. ++ * ++ * @param[in] input Input image. Data types supported: U8/S16/F32. ++ * @param[in] k The value of `k`. ++ * @param[out] values Top k values. Data types supported: S32 if input type is U8/S16, F32 if input type is F32. ++ * @param[out] indices indices related to top k values. Data types supported: S32 if input type is U8/S16, F32 if input type is F32. 
++ */ ++ void configure(ICLTensor *input, int k, ICLTensor *values, ICLTensor *indices, ++ int total_bits = 32, int bits = 4); ++ ++ // Inherited methods overridden: ++ void run() override; ++ ++private: ++ ++ void run_on_cpu(); ++ void run_on_gpu(); ++ void run_on_gpu_single_quicksort(); ++ ++ uint32_t _k; ++ uint32_t _total_bits; ++ uint32_t _bits; ++ uint32_t _radix; ++ uint32_t _hist_buf_size; ++ uint32_t _glob_sum_buf_size; ++ uint32_t _n; ++ ++ ICLTensor *_input; ++ ICLTensor *_values; ++ ICLTensor *_indices; ++ ++ cl::Buffer _qs_idx_buf; ++ cl::Buffer _qs_temp_buf; ++ cl::Buffer _hist_buf; ++ cl::Buffer _glob_sum_buf; ++ cl::Buffer _temp_buf; ++ cl::Buffer _first_negative_idx_buf; ++ cl::Buffer _in_key_buf; ++ cl::Buffer _out_key_buf; ++ cl::Buffer _in_ind_buf; ++ cl::Buffer _out_ind_buf; ++ ++ cl::Buffer *_p_in_key_buf; ++ cl::Buffer *_p_out_key_buf; ++ cl::Buffer *_p_in_ind_buf; ++ cl::Buffer *_p_out_ind_buf; ++ ++ CLTopKV2Single _qs_kernel; ++ CLTopKV2Init _init_kernel; ++ CLRadixSortHistogram _hist_kernel; ++ CLRadixSortScanHistogram _scan_hist_kernel; ++ CLRadixSortGlobalScanHistogram _glob_scan_hist_kernel; ++ CLRadixSortPasteHistogram _paste_hist_kernel; ++ CLRadixSortReorder _reorder_kernel; ++ CLTopKV2FindFirstNegative _find_first_negative_kernel; ++ CLTopKV2ReorderNegatives _reorder_negatives_kernel; ++ CLTopKV2Store _store_kernel; ++}; ++} ++#endif // __ARM_COMPUTE_CLTOPK_V2_H__ +diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp +index bdb26f8..0c9f108 100644 +--- a/src/core/CL/CLKernelLibrary.cpp ++++ b/src/core/CL/CLKernelLibrary.cpp +@@ -1,4 +1,5 @@ + /* ++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT +@@ -149,14 +150,19 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map = + { "accumulate_weighted", "accumulate.cl" }, + { "activation_layer", "activation_layer.cl" }, + { "activation_layer_qa8", "activation_layer_qa8.cl" }, ++ { "activation_layer_logistic_qa8", "activation_layer_qa8.cl" }, + { "arithmetic_add", "arithmetic_op.cl" }, + { "arithmetic_sub", "arithmetic_op.cl" }, ++ { "arithmetic_add_qasymm8", "arithmetic_op_quantized.cl" }, + { "batchnormalization_layer_nchw", "batchnormalization_layer.cl" }, + { "batchnormalization_layer_nhwc", "batchnormalization_layer.cl" }, + { "bitwise_or", "bitwise_op.cl" }, + { "bitwise_and", "bitwise_op.cl" }, + { "bitwise_xor", "bitwise_op.cl" }, + { "bitwise_not", "bitwise_op.cl" }, ++ { "cast", "cast.cl" }, ++ { "cast_qasymm_in", "cast.cl" }, ++ { "cast_qasymm_out", "cast.cl" }, + { "channel_combine_NV", "channel_combine.cl" }, + { "channel_combine_RGB888", "channel_combine.cl" }, + { "channel_combine_RGBA8888", "channel_combine.cl" }, +@@ -221,6 +227,9 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map = + { "fill_image_borders_replicate", "fill_border.cl" }, + { "finalize", "optical_flow_pyramid_lk.cl" }, + { "floor_layer", "floor.cl" }, ++ { "gather", "gather.cl" }, ++ { "gather_1d", "gather.cl" }, ++ { "gather_1d_out", "gather.cl" }, + { "gaussian1x5_sub_x", "gaussian_pyramid.cl" }, + { "gaussian5x1_sub_y", "gaussian_pyramid.cl" }, + { "gemm_accumulate_biases", "gemm.cl" }, +@@ -313,6 +322,9 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map = + { "permute_3201", "permute.cl" }, + { "pixelwise_mul_float", "pixelwise_mul_float.cl" }, + { "pixelwise_mul_int", "pixelwise_mul_int.cl" }, ++ { "pixelwise_mul_qasymm8", "pixelwise_mul_quantized.cl" }, ++ { "pixelwise_div_float", "pixelwise_div_float.cl" }, ++ { "pixelwise_div_int", "pixelwise_div_int.cl" }, + { "pooling_layer_2", "pooling_layer.cl" }, 
+ { "pooling_layer_3", "pooling_layer.cl" }, + { "pooling_layer_optimized_3", "pooling_layer.cl" }, +@@ -322,7 +334,9 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map = + { "pooling_layer_MxN_quantized_nhwc", "pooling_layer_quantized.cl" }, + { "pooling_layer_MxN_quantized_nchw", "pooling_layer_quantized.cl" }, + { "quantization_layer", "quantization_layer.cl" }, ++ { "reduce_max", "reduce_max.cl"}, + { "reduction_operation", "reduction_operation.cl" }, ++ { "reduction_mean", "reduction_mean.cl" }, + { "remap_nearest_neighbour", "remap.cl" }, + { "remap_bilinear", "remap.cl" }, + { "reshape_layer", "reshape_layer.cl" }, +@@ -350,6 +364,7 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map = + { "softmax_layer_max_shift_exp_sum_quantized_parallel", "softmax_layer_quantized.cl" }, + { "softmax_layer_max_shift_exp_sum_serial", "softmax_layer.cl" }, + { "softmax_layer_max_shift_exp_sum_parallel", "softmax_layer.cl" }, ++ { "strided_slice", "strided_slice.cl" }, + { "suppress_non_maximum", "canny.cl" }, + { "tablelookup_U8", "tablelookup.cl" }, + { "tablelookup_S16", "tablelookup.cl" }, +@@ -378,6 +393,15 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map = + { "YUYV422_to_NV12_bt709", "color_convert.cl" }, + { "YUYV422_to_RGB888_bt709", "color_convert.cl" }, + { "YUYV422_to_RGBA8888_bt709", "color_convert.cl" }, ++ { "topkv2_init", "topkv2.cl" }, ++ { "topkv2_find_first_negative", "topkv2.cl" }, ++ { "topkv2_reorder_negatives", "topkv2.cl" }, ++ { "topkv2_store", "topkv2.cl" }, ++ { "radixsort_histogram", "topkv2_radixsort.cl" }, ++ { "radixsort_scanhistograms", "topkv2_radixsort.cl" }, ++ { "radixsort_pastehistograms", "topkv2_radixsort.cl" }, ++ { "radixsort_reorder", "topkv2_radixsort.cl" }, ++ { "topkv2_quicksort", "topkv2_quicksort.cl" }, + }; + + const std::map<std::string, std::string> CLKernelLibrary::_program_source_map = +@@ -404,6 +428,10 @@ const std::map<std::string, 
std::string> CLKernelLibrary::_program_source_map = + #include "./cl_kernels/arithmetic_op.clembed" + }, + { ++ "arithmetic_op_quantized.cl", ++#include "./cl_kernels/arithmetic_op_quantized.clembed" ++ }, ++ { + "bitwise_op.cl", + #include "./cl_kernels/bitwise_op.clembed" + }, +@@ -412,6 +440,10 @@ const std::map<std::string, std::string> CLKernelLibrary::_program_source_map = + #include "./cl_kernels/canny.clembed" + }, + { ++ "cast.cl", ++#include "./cl_kernels/cast.clembed" ++ }, ++ { + "channel_combine.cl", + #include "./cl_kernels/channel_combine.clembed" + }, +@@ -532,6 +564,10 @@ const std::map<std::string, std::string> CLKernelLibrary::_program_source_map = + #include "./cl_kernels/floor.clembed" + }, + { ++ "gather.cl", ++#include "./cl_kernels/gather.clembed" ++ }, ++ { + "gaussian_pyramid.cl", + #include "./cl_kernels/gaussian_pyramid.clembed" + }, +@@ -636,6 +672,18 @@ const std::map<std::string, std::string> CLKernelLibrary::_program_source_map = + #include "./cl_kernels/pixelwise_mul_int.clembed" + }, + { ++ "pixelwise_mul_quantized.cl", ++#include "./cl_kernels/pixelwise_mul_quantized.clembed" ++ }, ++ { ++ "pixelwise_div_float.cl", ++#include "./cl_kernels/pixelwise_div_float.clembed" ++ }, ++ { ++ "pixelwise_div_int.cl", ++#include "./cl_kernels/pixelwise_div_int.clembed" ++ }, ++ { + "pooling_layer.cl", + #include "./cl_kernels/pooling_layer.clembed" + }, +@@ -648,10 +696,18 @@ const std::map<std::string, std::string> CLKernelLibrary::_program_source_map = + #include "./cl_kernels/quantization_layer.clembed" + }, + { ++ "reduce_max.cl", ++#include "./cl_kernels/reduce_max.clembed" ++ }, ++ { + "reduction_operation.cl", + #include "./cl_kernels/reduction_operation.clembed" + }, + { ++ "reduction_mean.cl", ++#include "./cl_kernels/reduction_mean.clembed" ++ }, ++ { + "remap.cl", + #include "./cl_kernels/remap.clembed" + }, +@@ -684,6 +740,10 @@ const std::map<std::string, std::string> CLKernelLibrary::_program_source_map = + #include 
"./cl_kernels/softmax_layer_quantized.clembed" + }, + { ++ "strided_slice.cl", ++#include "./cl_kernels/strided_slice.clembed" ++ }, ++ { + "tablelookup.cl", + #include "./cl_kernels/tablelookup.clembed" + }, +@@ -715,6 +775,18 @@ const std::map<std::string, std::string> CLKernelLibrary::_program_source_map = + "winograd.cl", + #include "./cl_kernels/winograd.clembed" + }, ++ { ++ "topkv2.cl", ++#include "./cl_kernels/topkv2.clembed" ++ }, ++ { ++ "topkv2_radixsort.cl", ++#include "./cl_kernels/topkv2_radixsort.clembed" ++ }, ++ { ++ "topkv2_quicksort.cl", ++#include "./cl_kernels/topkv2_quicksort.clembed" ++ }, + #endif /* EMBEDDED_KERNELS */ + }; + +diff --git a/src/core/CL/cl_kernels/activation_layer_qa8.cl b/src/core/CL/cl_kernels/activation_layer_qa8.cl +index 66e54ed..5540932 100644 +--- a/src/core/CL/cl_kernels/activation_layer_qa8.cl ++++ b/src/core/CL/cl_kernels/activation_layer_qa8.cl +@@ -21,10 +21,17 @@ + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +-#include "helpers.h" ++#include "helpers_asymm.h" + + #define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + ++// Logistic Activation ++inline TYPE logistic_op(TYPE x) ++{ ++ // This function is a temporary function that is not actually executed. ++ // To keep the existing structure, it is added. ++ return x; ++} + // RELU Activation + inline TYPE relu_op(TYPE x) + { +@@ -119,4 +126,100 @@ __kernel void activation_layer_qa8( + (data, 0, (__global DATA_TYPE *)output.ptr); + } + +-#endif /* defined(ACT) */ +\ No newline at end of file ++#endif /* defined(ACT) */ ++ ++/** This performs a logistic activation function on QASYMM8 inputs. ++ * ++ * @note In order to perform the logistic activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time ++ * ++ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. 
-DDATA_TYPE=short ++ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 ++ * @note Quantization scales of the input/output tensors are passed in with -DS1_VAL= and -DS2_VAL= respectively. ++ * @note Quantization offsets of the input/output tensors are passed in with -DO1_VAL= and -DO2_VAL= respectively. ++ * @note Quantized value of constant zero should be given as a preprocessor argument using -DCONST_0=value. e.g. -DCONST_0=128. ++ * @note The input multiplier and left shift can optionally be passed at compile time using -DINPUT_MULTIPLIER and -DINPUT_LEFT_SHIFT (if undefined, the original data is used and is not rescaled). ++ * @note Number of integer bits should be given as a preprocessor argument using -DINPUT_INTEGER_BITS=value. e.g. -DINPUT_INTEGER_BITS=4. ++ * @note The input range radius should be given at compile time using -DINPUT_RANGE_RADIUS. ++ * ++ * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8 ++ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) ++ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) ++ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) ++ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) ++ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) ++ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) ++ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image ++ * @param[out] output_ptr Pointer to the destination image.
Supported data types: same as @p input_ptr ++ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) ++ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) ++ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) ++ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) ++ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) ++ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) ++ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image ++ */ ++__kernel void activation_layer_logistic_qa8( ++ TENSOR3D_DECLARATION(input) ++#ifndef IN_PLACE ++ , ++ TENSOR3D_DECLARATION(output) ++#endif /* not IN_PLACE */ ++) ++{ ++ // Get pixels pointer ++ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); ++#ifdef IN_PLACE ++ Tensor3D output = input; ++#else /* IN_PLACE */ ++ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); ++#endif /* IN_PLACE */ ++ ++ // Load data ++ VEC_DATA_TYPE(int, 16) ++ data = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), VEC_DATA_TYPE(int, 16)); ++ ++ VEC_DATA_TYPE(int, 16) ++ result = data; ++ ++#if defined(INPUT_INTEGER_BITS) && defined(INPUT_RANGE_RADIUS) ++ const VEC_DATA_TYPE(int, 16) Q0_one = INT_MAX; ++ const VEC_DATA_TYPE(int, 16) Q0_one_half = (1 << 30); ++ ++ VEC_DATA_TYPE(int, 16) ++ input_val_centered = data; ++#ifdef O1_VAL ++ input_val_centered = data - O1_VAL; ++#endif /* O1_VAL */ ++ ++ VEC_DATA_TYPE(int, 16) result_left = ASYMM_SELECT_USING_MASK(input_val_centered <= -INPUT_RANGE_RADIUS, 1, 0, 16); ++ VEC_DATA_TYPE(int, 16) result_right = ASYMM_SELECT_USING_MASK(input_val_centered >= INPUT_RANGE_RADIUS, 255, 0, 16); ++ ++ VEC_DATA_TYPE(int, 16) input_mask = ASYMM_SELECT_USING_MASK(input_val_centered > -INPUT_RANGE_RADIUS && 
input_val_centered < INPUT_RANGE_RADIUS, 1, 0, 16); ++ VEC_DATA_TYPE(int, 16) input_val_rescaled = input_val_centered * input_mask; ++#if defined(INPUT_MULTIPLIER) && defined(INPUT_LEFT_SHIFT) ++ if(INPUT_MULTIPLIER > 1) ++ { ++ input_val_rescaled = ASYMM_MULT(input_val_rescaled * (1 << INPUT_LEFT_SHIFT), INPUT_MULTIPLIER, 16); ++ } ++#endif /* defined(INPUT_MULTIPLIER) && defined(INPUT_LEFT_SHIFT) */ ++ ++ VEC_DATA_TYPE(int, 16) mask_if_positive = ASYMM_MASK_IF_NON_ZERO(input_val_rescaled > CONST_0, 16); ++ VEC_DATA_TYPE(int, 16) mask_if_zero = ASYMM_MASK_IF_NON_ZERO(!input_val_rescaled, 16); ++ VEC_DATA_TYPE(int, 16) abs_input = ASYMM_SELECT_USING_MASK(mask_if_positive, input_val_rescaled, -input_val_rescaled, 16); ++ VEC_DATA_TYPE(int, 16) result_exp = ASYMM_EXP_ON_NEGATIVE_VALUES(-abs_input, INPUT_INTEGER_BITS, 16); ++ VEC_DATA_TYPE(int, 16) result_if_positive = ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(result_exp, 16); ++ VEC_DATA_TYPE(int, 16) result_if_negative = Q0_one - result_if_positive; ++ VEC_DATA_TYPE(int, 16) result_logistic = ASYMM_SELECT_USING_MASK(mask_if_zero, Q0_one_half, ASYMM_SELECT_USING_MASK(mask_if_positive, result_if_positive, result_if_negative, 16), 16); ++ ++ result_logistic = ASYMM_ROUNDING_DIVIDE_BY_POW2(result_logistic, 23, 16); ++ result_logistic = ASYMM_SELECT_USING_MASK(result_logistic == 256, 255, result_logistic, 16); ++ result_logistic = result_logistic * input_mask; ++ ++ result = result_left + result_right + result_logistic; ++#endif /* defined(INPUT_INTEGER_BITS) && defined(INPUT_RANGE_RADIUS) */ ++ ++ // Store result ++ TYPE tmp = CONVERT(result, TYPE); ++ VSTORE(VEC_SIZE) ++ (tmp, 0, (__global DATA_TYPE *)output.ptr); ++} +diff --git a/src/core/CL/cl_kernels/arithmetic_op_quantized.cl b/src/core/CL/cl_kernels/arithmetic_op_quantized.cl +new file mode 100644 +index 0000000..0c0a9ed +--- /dev/null ++++ b/src/core/CL/cl_kernels/arithmetic_op_quantized.cl +@@ -0,0 +1,138 @@ ++/* ++ * Copyright (c) 2018 Samsung Electronics Co., 
Ltd. All Rights Reserved ++ * Copyright (c) 2016, 2017 ARM Limited. ++ * ++ * SPDX-License-Identifier: MIT ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in all ++ * copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. 
++ */ ++#include "helpers_asymm.h" ++ ++#if defined(FIXED_POINT_POSITION) ++#include "fixed_point.h" ++#endif /* FIXED_POINT_POSITION */ ++ ++#ifdef SATURATE ++#define ADD(x, y) add_sat((x), (y)) ++#define SUB(x, y) sub_sat((x), (y)) ++#else /* SATURATE */ ++#define ADD(x, y) (x) + (y) ++#define SUB(x, y) (x) - (y) ++#endif /* SATURATE */ ++ ++/** Performs a pixelwise addition used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8 ++ * ++ * The following computations will be performed: ++ * ++ * -# Add offset terms to inputs ++ * -# Get scaled value of two inputs ++ * -# Add inputs ++ * -# Add offset terms to final result ++ * -# Multiply each entry of result by result_mult_int ++ * -# Shift the int32 accumulator by result_shift ++ * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. ++ * ++ * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: ++ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar ++ * @attention The number of bits to shift left of input tensors must be passed at compile time using -DLEFT_SHIFT ++ * @attention The offset, scalar scale factor and number of bits to shift right of input tensors must be passed at compile time using -DIN1_OFFSET, -DIN1_MULT_INT, -DIN1_SHIFT, -DIN2_OFFSET, -DIN2_MULT_INT and -DIN2_SHIFT ++ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULT_INT and -DRESULT_SHIFT ++ * ++ * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: ++ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar ++ * @attention The inputs and output scale information of qasymm8 need to be passed at compile time using -DSCALE_IN1, -DSCALE_IN2 and -DSCALE_OUT: ++ * e.g.
-DSCALE_IN1=1.f -DSCALE_IN2=1.f -DSCALE_OUT=2.f ++ * @attention The inputs and output scale offset need to be passed at compile time using -DOFFSET_IN1, -DOFFSET_IN2 and -DOFFSET_OUT: ++ * e.g. -DOFFSET_IN1=0 -DOFFSET_IN2=0 -DOFFSET_OUT=0 ++ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 ++ * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used. ++ * ++ * @param[in] in1_ptr Pointer to the source tensor. Supported data types: QASYMM8 ++ * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes) ++ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) ++ * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes) ++ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) ++ * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes) ++ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes) ++ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor ++ * @param[in] in2_ptr Pointer to the source tensor. 
Supported data types: QASYMM8 ++ * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes) ++ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) ++ * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes) ++ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) ++ * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes) ++ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes) ++ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor ++ * @param[out] out_ptr Pointer to the destination tensor. Supported data types: QASYMM8 ++ * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes) ++ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) ++ * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes) ++ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) ++ * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes) ++ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes) ++ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor ++ */ ++__kernel void arithmetic_add_qasymm8( ++ TENSOR3D_DECLARATION(in1), ++ TENSOR3D_DECLARATION(in2), ++ TENSOR3D_DECLARATION(out)) ++{ ++ // Get pixels pointer ++ Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); ++ Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); ++ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); ++ ++ // Load data ++ VEC_DATA_TYPE(int, 16) ++ in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16)); ++ VEC_DATA_TYPE(int, 16) ++ in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 
*)in2.ptr), VEC_DATA_TYPE(int, 16)); ++ ++ // Get scaled value of two inputs ++ VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET); ++ VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET); ++ ++ VEC_DATA_TYPE(int, 16) left_shift = (VEC_DATA_TYPE(int, 16))1 << (VEC_DATA_TYPE(int, 16))(LEFT_SHIFT); ++ VEC_DATA_TYPE(int, 16) shifted_in1_val = in1_val * left_shift; ++ VEC_DATA_TYPE(int, 16) shifted_in2_val = in2_val * left_shift; ++ ++ VEC_DATA_TYPE(int, 16) scaled_in1_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in1_val, IN1_MULT_INT, IN1_SHIFT, 16); ++ VEC_DATA_TYPE(int, 16) scaled_in2_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in2_val, IN2_MULT_INT, IN2_SHIFT, 16); ++ ++ // Add inputs and multiply with a multiplier smaller than 1 ++ VEC_DATA_TYPE(int, 16) sum_val = scaled_in1_val + scaled_in2_val; ++ VEC_DATA_TYPE(int, 16) out_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(sum_val, RESULT_MULT_INT, RESULT_SHIFT, 16); ++ out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET); ++ ++ VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16)); ++ ++// TODO: Apply min-max BOUND to support fuse with relu. ++/* ++#if defined(MIN_BOUND) ++ res = max(res, (uchar16)MIN_BOUND); ++#endif // defined(MIN_BOUND) ++#if defined(MAX_BOUND) ++ res = min(res, (uchar16)MAX_BOUND); ++#endif // defined(MAX_BOUND) ++*/ ++ ++ // Store result ++ VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), ++ 0, (__global DATA_TYPE_OUT *)out.ptr); ++} +diff --git a/src/core/CL/cl_kernels/cast.cl b/src/core/CL/cl_kernels/cast.cl +new file mode 100644 +index 0000000..113804c +--- /dev/null ++++ b/src/core/CL/cl_kernels/cast.cl +@@ -0,0 +1,148 @@ ++/* ++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved ++ * Copyright (c) 2017 ARM Limited. 
++ * ++ * SPDX-License-Identifier: MIT ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in all ++ * copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++#include "helpers.h" ++ ++#ifndef SCALE_IN ++#define SCALE_IN 1.0f ++#endif ++#ifndef OFFSET_IN ++#define OFFSET_IN 0 ++#endif ++ ++/** Perform a cast operation on an input tensor. ++ * ++ * @attention Data type can be passed using the -DDATA_TYPE_IN compile flag, e.g. -DDATA_TYPE_IN=float ++ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 ++ * ++ * @param[in] input_ptr Pointer to the source image. 
Supported data types: F16/F32 ++ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) ++ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) ++ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) ++ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) ++ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) ++ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) ++ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image ++ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr ++ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) ++ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) ++ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) ++ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) ++ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) ++ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) ++ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image ++ */ ++__kernel void cast( ++ TENSOR3D_DECLARATION(input), ++ TENSOR3D_DECLARATION(output)) ++{ ++ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); ++ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); ++ ++ VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr), ++ VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), ++ 0, (__global DATA_TYPE_OUT *)output.ptr); ++} ++ ++ ++/** Perform a cast operation on an QASYMM8 input tensor. 
++ * ++ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 ++ * ++ * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 ++ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) ++ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) ++ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) ++ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) ++ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) ++ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) ++ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image ++ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr ++ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) ++ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) ++ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) ++ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) ++ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) ++ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) ++ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image ++ */ ++__kernel void cast_qasymm_in( ++ TENSOR3D_DECLARATION(input), ++ TENSOR3D_DECLARATION(output)) ++{ ++ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); ++ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); ++ ++ VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_data = ++ VLOAD(VEC_SIZE)(0, (__global 
DATA_TYPE_IN *)input.ptr); ++ VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET_IN); ++ VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE_IN); ++ ++ VEC_DATA_TYPE(int, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(int, VEC_SIZE)) - offset; ++ VEC_DATA_TYPE(float, VEC_SIZE) out_data = CONVERT(tmp, VEC_DATA_TYPE(float, VEC_SIZE)) * scale; ++ ++ VSTORE(VEC_SIZE)(CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), ++ 0, (__global DATA_TYPE_OUT *)output.ptr); ++} ++ ++ ++/** Perform a cast operation on an QASYMM8 output tensor. ++ * ++ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 ++ * ++ * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 ++ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) ++ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) ++ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) ++ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) ++ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) ++ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) ++ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image ++ * @param[out] output_ptr Pointer to the destination image. 
Supported data types: U8 ++ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) ++ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) ++ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) ++ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) ++ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) ++ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) ++ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image ++ */ ++__kernel void cast_qasymm_out( ++ TENSOR3D_DECLARATION(input), ++ TENSOR3D_DECLARATION(output)) ++{ ++ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); ++ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); ++ ++ VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_data = ++ VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr); ++ VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET_IN); ++ VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE_IN); ++ ++ VEC_DATA_TYPE(float, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(float, VEC_SIZE)) / scale; ++ VEC_DATA_TYPE(float, VEC_SIZE) out_data = tmp + CONVERT(offset, VEC_DATA_TYPE(float, VEC_SIZE)); ++ ++ VSTORE(VEC_SIZE)(CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), ++ 0, (__global DATA_TYPE_OUT *)output.ptr); ++} +diff --git a/src/core/CL/cl_kernels/fixed_point.h b/src/core/CL/cl_kernels/fixed_point.h +index 46fa645..e2f376b 100644 +--- a/src/core/CL/cl_kernels/fixed_point.h ++++ b/src/core/CL/cl_kernels/fixed_point.h +@@ -1,4 +1,5 @@ + /* ++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT +@@ -298,6 +299,29 @@ MLALQ_SAT_IMPL(qs16x8, qs32x8) + #define MLAL_SAT_OP_EXPAND_STR(a, b, c, type, size, position) mlal_sat_##type##x##size((a), (b), (c), (position)) + #define MLAL_SAT_OP_EXPAND(a, b, c, type, size, position) MLAL_SAT_OP_EXPAND_STR(a, b, c, type, size, position) + ++/* Division of two fixed point numbers ++ * ++ * @param[in] type the actual data type. ++ * @param[in] itype the intermediate data type. ++ * ++ * @return The result of the fixed point division. ++ */ ++#define DIVQ_IMPL(type, itype) \ ++ inline type div_##type(type VopA, type VopB, int fixed_point_position) \ ++ { \ ++ itype round_val = (itype)(1 << (fixed_point_position - 1)); \ ++ itype res = CONVERT((VopA), itype) / CONVERT((VopB), itype) + round_val; \ ++ return CONVERT((res >> (itype)fixed_point_position), type); \ ++ } ++ ++DIVQ_IMPL(qs8x8, qs16x8) ++DIVQ_IMPL(qs16x8, qs32x8) ++DIVQ_IMPL(qs8x16, qs16x16) ++DIVQ_IMPL(qs16x16, qs32x16) ++ ++#define DIV_OP_EXPAND_STR(a, b, type, size, position) div_##type##x##size((a), (b), (position)) ++#define DIV_OP_EXPAND(a, b, type, size, position) DIV_OP_EXPAND_STR(a, b, type, size, position) ++ + /** Saturate division of two fixed point vectors + * + * @param[in] stype the actual scalar data type. +diff --git a/src/core/CL/cl_kernels/gather.cl b/src/core/CL/cl_kernels/gather.cl +new file mode 100644 +index 0000000..25e20f5 +--- /dev/null ++++ b/src/core/CL/cl_kernels/gather.cl +@@ -0,0 +1,106 @@ ++/* ++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved ++ * Copyright (c) 2017 ARM Limited. 
++ * ++ * SPDX-License-Identifier: MIT ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in all ++ * copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++#include "helpers.h" ++ ++/** Perform gather ++ * ++ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short ++ * ++ * @param[in] input1_ptr Pointer to the first source tensor. 
Supported data types: U8/S32/F32 ++ * @param[in] input1_stride_x Stride of the first source tensor in X dimension (in bytes) ++ * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes) ++ * @param[in] input1_stride_y Stride of the first source tensor in Y dimension (in bytes) ++ * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes) ++ * @param[in] input1_stride_z Stride of the first source tensor in Z dimension (in bytes) ++ * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes) ++ * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the first source tensor ++ * @param[in] input2_ptr Pointer to the second source tensor. Supported data types: U32 ++ * @param[in] input2_stride_x Stride of the second source tensor in X dimension (in bytes) ++ * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes) ++ * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the second source tensor ++ * @param[out] output_ptr Pointer to the destination tensor.
Supported data types: same as @p input_ptr ++ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) ++ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) ++ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) ++ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) ++ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) ++ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) ++ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor ++ */ ++__kernel void gather(IMAGE_DECLARATION(input1), ++ VECTOR_DECLARATION(input2), ++ IMAGE_DECLARATION(output)) ++{ ++ Image in1 = CONVERT_TO_IMAGE_STRUCT_NO_STEP(input1); ++ Vector in2 = CONVERT_TO_VECTOR_STRUCT(input2); ++ Image out = CONVERT_TO_IMAGE_STRUCT_NO_STEP(output); ++ ++ VEC_DATA_TYPE(DATA_TYPE_IN2, 2) ++ in2_data = CONVERT(vload2(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_IN2, 2)); ++ ++ //TODO: performance tuning for memcopy ++ int index = in2_data.s0; ++ int stride=input1_stride_y/input1_stride_x; ++ ++ for(int i=0; i<stride; i++){ ++ *((__global DATA_TYPE_OUT *)offset(&out, i,get_global_id(0)))=*((__global DATA_TYPE_IN1 *)offset(&in1, i,index)); ++ } ++} ++ ++__kernel void gather_1d_out(IMAGE_DECLARATION(input1), ++ VECTOR_DECLARATION(input2), ++ VECTOR_DECLARATION(output)) ++{ ++ Image in1 = CONVERT_TO_IMAGE_STRUCT_NO_STEP(input1); ++ Vector in2 = CONVERT_TO_VECTOR_STRUCT(input2); ++ Vector out = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output); ++ ++ VEC_DATA_TYPE(DATA_TYPE_IN2, 2) ++ in2_data = CONVERT(vload2(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_IN2, 2)); ++ ++ //TODO: performance tuning for memcopy ++ int index = in2_data.s0; ++ int 
stride=input1_stride_y/input1_stride_x; ++ ++ for(int i=0; i<stride; i++){ ++ *((__global DATA_TYPE_OUT *)vector_offset(&out, i+get_global_id(0)))=*((__global DATA_TYPE_IN1 *)offset(&in1, i, index)); ++ } ++} ++ ++__kernel void gather_1d(VECTOR_DECLARATION(input1), ++ VECTOR_DECLARATION(input2), ++ VECTOR_DECLARATION(output)) ++{ ++ Vector in1 = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input1); ++ Vector in2 = CONVERT_TO_VECTOR_STRUCT(input2); ++ Vector out = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output); ++ ++ VEC_DATA_TYPE(DATA_TYPE_IN2, 2) ++ in2_data = CONVERT(vload2(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_IN2, 2)); ++ ++ //TODO: performance tuning for memcopy ++ int index = in2_data.s0; ++ *((__global DATA_TYPE_OUT *)vector_offset(&out, get_global_id(0)))=*((__global DATA_TYPE_IN1 *)vector_offset(&in1, index)); ++} +diff --git a/src/core/CL/cl_kernels/pixelwise_div_float.cl b/src/core/CL/cl_kernels/pixelwise_div_float.cl +new file mode 100644 +index 0000000..512c620 +--- /dev/null ++++ b/src/core/CL/cl_kernels/pixelwise_div_float.cl +@@ -0,0 +1,96 @@ ++/* ++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved ++ * Copyright (c) 2016, 2017 ARM Limited. ++ * ++ * SPDX-License-Identifier: MIT ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in all ++ * copies or substantial portions of the Software. 
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#include "helpers.h"
++
++#ifdef SATURATE
++#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x))
++#else /* SATURATE */
++#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x))
++#endif /* SATURATE */
++#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round)
++
++/** Divides the first input by the second, element by element, and multiplies the quotient by a float scale.
++ *
++ * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
++ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short
++ * @attention The data type of the intermediate result of the division should be passed as well using -DDATA_TYPE_RES.
++ * e.g. If one of the inputs is S16, -DDATA_TYPE_RES=int should be passed, else -DDATA_TYPE_RES=short.
++ * @attention -DDATA_TYPE_FLOAT must be passed if floating point inputs are provided; otherwise the conversion suffix is taken from the ROUND define.
++ *
++ * @param[in] in1_ptr Pointer to the first source image (dividend). Supported data types: U8, S16, F16, F32
++ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
++ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
++ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
++ * @param[in] in1_stride_z Stride of the source image in Z dimension (in bytes)
++ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
++ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
++ * @param[in] in2_ptr Pointer to the second source image (divisor). Supported data types: U8, S16, F16, F32
++ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
++ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
++ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
++ * @param[in] in2_stride_z Stride of the source image in Z dimension (in bytes)
++ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
++ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
++ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16, F16, F32
++ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
++ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
++ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
++ * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes)
++ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
++ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
++ * @param[in] scale Float scaling factor. Supported data types: F32
++ */
++__kernel void pixelwise_div_float(
++    TENSOR3D_DECLARATION(in1),
++    TENSOR3D_DECLARATION(in2),
++    TENSOR3D_DECLARATION(out),
++    const float scale)
++{
++    // Build the tensor views for both operands and for the destination
++    Tensor3D lhs = CONVERT_TO_TENSOR3D_STRUCT(in1);
++    Tensor3D rhs = CONVERT_TO_TENSOR3D_STRUCT(in2);
++    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(out);
++
++    // Fetch 16 elements of each operand, converted to the intermediate type
++    VEC_DATA_TYPE(DATA_TYPE_RES, 16)
++    numer = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)lhs.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
++    VEC_DATA_TYPE(DATA_TYPE_RES, 16)
++    denom = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)rhs.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
++
++    // Divide in DATA_TYPE_RES, scale, then convert to the output type
++#ifdef DATA_TYPE_FLOAT
++    VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
++    quotient = CONVERT(numer / denom * (DATA_TYPE_RES)scale, VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
++#else /* DATA_TYPE_FLOAT */
++    VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
++    quotient = CONVERT_OP_FLOAT(CONVERT_OP_FLOAT((convert_float16(numer / denom) * scale), VEC_DATA_TYPE(DATA_TYPE_RES, 16), ROUND), VEC_DATA_TYPE(DATA_TYPE_OUT, 16), ROUND);
++#endif /* DATA_TYPE_FLOAT */
++
++    // Write the 16 results to the destination
++    vstore16(quotient, 0, (__global DATA_TYPE_OUT *)dst.ptr);
++}
+diff --git
a/src/core/CL/cl_kernels/pixelwise_div_int.cl b/src/core/CL/cl_kernels/pixelwise_div_int.cl +new file mode 100644 +index 0000000..82edf3b +--- /dev/null ++++ b/src/core/CL/cl_kernels/pixelwise_div_int.cl +@@ -0,0 +1,103 @@ ++/* ++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved ++ * Copyright (c) 2016, 2017 ARM Limited. ++ * ++ * SPDX-License-Identifier: MIT ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in all ++ * copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. 
++ */
++#include "helpers.h"
++
++#if defined(FIXED_POINT_POSITION)
++
++#include "fixed_point.h"
++
++#if defined(SATURATE)
++#define DIV_OP(x, y, scale, type, size) DIV_SAT_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION)
++#else // SATURATE
++#define DIV_OP(x, y, scale, type, size) DIV_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION)
++#endif // SATURATE
++
++#else // FIXED_POINT_POSITION
++
++#if defined(SATURATE)
++#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size##_sat(x))
++#else // SATURATE
++#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size(x))
++#endif // SATURATE
++#define CONVERT_OP_INT(x, type, size) CONVERT_OP_INT_STR(x, type, size)
++
++#define DIV_OP(x, y, scale, type, size) CONVERT_OP_INT((x) / (y) >> scale, type, size)
++
++#endif // FIXED_POINT_POSITION
++
++/** Divides the first integer input by the second, element by element, applying an integer scale.
++ *
++ * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
++ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short
++ * @attention The data type of the intermediate result of the division should be passed as well using -DDATA_TYPE_RES.
++ * e.g. If one of the inputs is S16, -DDATA_TYPE_RES=int should be passed, else -DDATA_TYPE_RES=short.
++ * @note In case of fixed-point operation -DFIXED_POINT_POSITION=fixed_point_position must be provided: e.g. -DFIXED_POINT_POSITION=3
++ *
++ * @param[in] in1_ptr Pointer to the first source image (dividend). Supported data types: U8/QS8/QS16/S16
++ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
++ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
++ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
++ * @param[in] in1_stride_z Stride of the source image in Z dimension (in bytes)
++ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
++ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
++ * @param[in] in2_ptr Pointer to the second source image (divisor). Supported data types: same as @p in1_ptr
++ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
++ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
++ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
++ * @param[in] in2_stride_z Stride of the source image in Z dimension (in bytes)
++ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
++ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
++ * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in1_ptr
++ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
++ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
++ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
++ * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes)
++ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
++ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
++ * @param[in] scale Number of bits the quotient is shifted right by (declared as uint). Not used by the fixed-point variants of DIV_OP.
++ */
++__kernel void pixelwise_div_int(
++    TENSOR3D_DECLARATION(in1),
++    TENSOR3D_DECLARATION(in2),
++    TENSOR3D_DECLARATION(out),
++    const uint scale)
++{
++    // Build the tensor views for both operands and for the destination
++    Tensor3D lhs = CONVERT_TO_TENSOR3D_STRUCT(in1);
++    Tensor3D rhs = CONVERT_TO_TENSOR3D_STRUCT(in2);
++    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(out);
++
++    // Fetch 16 elements of each operand, converted to the intermediate type
++    VEC_DATA_TYPE(DATA_TYPE_RES, 16) numer = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)lhs.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
++    VEC_DATA_TYPE(DATA_TYPE_RES, 16) denom = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)rhs.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
++
++    // Divide (and scale) through the DIV_OP variant selected at compile time
++    VEC_DATA_TYPE(DATA_TYPE_OUT, 16) quotient = DIV_OP(numer, denom, scale, DATA_TYPE_OUT, 16);
++    // Write the 16 results to the destination
++    vstore16(quotient, 0, (__global DATA_TYPE_OUT *)dst.ptr);
++}
+diff --git a/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl b/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
+new file mode 100644
+index 0000000..ddc9d5a
+--- /dev/null
++++ b/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
+@@ -0,0 +1,119 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd.
All Rights Reserved ++ * Copyright (c) 2016, 2017 ARM Limited. ++ * ++ * SPDX-License-Identifier: MIT ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in all ++ * copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. 
++ */
++#include "helpers_asymm.h"
++
++#ifdef SATURATE
++#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x))
++#else /* SATURATE */
++#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x))
++#endif /* SATURATE */
++#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round)
++
++#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
++/** Performs a pixelwise multiplication of two QASYMM8 inputs and quantizes the int32 product down to QASYMM8
++ *
++ * The following computations will be performed by the kernel:
++ *
++ * -# Add offset terms to inputs
++ * -# Multiply inputs
++ * -# Multiply each entry of result by result_mult_int
++ * -# Shift the int32 accumulator by result_shift
++ * -# Add offset terms to final result
++ * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
++ *
++ * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
++ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
++ * @attention The offset factor of inputs must be passed at compile time using -DIN1_OFFSET and -DIN2_OFFSET
++ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULT_INT and -DRESULT_SHIFT
++ *
++ * @param[in] in1_ptr Pointer to the first source image. Supported data types: U8
++ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
++ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
++ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
++ * @param[in] in1_stride_z Stride of the source image in Z dimension (in bytes)
++ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
++ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
++ * @param[in] in2_ptr Pointer to the second source image. Supported data types: U8
++ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
++ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
++ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
++ * @param[in] in2_stride_z Stride of the source image in Z dimension (in bytes)
++ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
++ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
++ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8
++ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
++ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
++ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
++ * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes)
++ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
++ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
++ * @param[in] scale Float scaling factor. Supported data types: F32 (not read by this kernel; the scaling comes from RESULT_MULT_INT / RESULT_SHIFT)
++ */
++__kernel void pixelwise_mul_qasymm8(
++    TENSOR3D_DECLARATION(in1),
++    TENSOR3D_DECLARATION(in2),
++    TENSOR3D_DECLARATION(out),
++    const float scale)
++{
++    // Get pixels pointer
++    Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
++    Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
++    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
++
++    // Load data
++    VEC_DATA_TYPE(int, 16)
++    in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16));
++    VEC_DATA_TYPE(int, 16)
++    in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16));
++
++    // Perform multiplication of two inputs
++    VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET);
++    VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET);
++    VEC_DATA_TYPE(int, 16) out_val = in1_val * in2_val;
++
++    // Multiply with a multiplier smaller than 1
++    out_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(out_val, RESULT_MULT_INT, RESULT_SHIFT, 16);
++    out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET);
++    // Saturating conversion: values outside [0..255] are clamped instead of wrapping modulo 256
++    VEC_DATA_TYPE(uchar, 16) res = convert_uchar16_sat(out_val);
++
++// TODO: Apply min-max BOUND to support fuse with relu.
++/*
++#if defined(MIN_BOUND)
++    res = max(res, (uchar16)MIN_BOUND);
++#endif // defined(MIN_BOUND)
++#if defined(MAX_BOUND)
++    res = min(res, (uchar16)MAX_BOUND);
++#endif // defined(MAX_BOUND)
++*/
++
++    // Store result
++    VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)),
++               0, (__global DATA_TYPE_OUT *)out.ptr);
++}
++#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
+diff --git a/src/core/CL/cl_kernels/reduce_max.cl b/src/core/CL/cl_kernels/reduce_max.cl
+new file mode 100644
+index 0000000..dfa3b85
+--- /dev/null
++++ b/src/core/CL/cl_kernels/reduce_max.cl
+@@ -0,0 +1,60 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2017 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#include "helpers.h"
++
++#if defined(WIDTH)
++/** Writes the maximum of WIDTH consecutive float elements of the input to the output.
++ *
++ * @note The element count must be passed at compile time using -DWIDTH=n, e.g. -DWIDTH=10. Elements are accessed as float; no DATA_TYPE define is used.
++ *
++ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F32 (accessed as float)
++ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
++ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
++ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
++ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
++ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
++ */
++__kernel void reduce_max(VECTOR_DECLARATION(input),
++                         VECTOR_DECLARATION(output))
++{
++    Vector src = CONVERT_TO_VECTOR_STRUCT(input);
++    Vector dst = CONVERT_TO_VECTOR_STRUCT(output);
++
++    // View the source bytes as floats
++    __global const float *values = (__global const float *)(src.ptr);
++
++    // Seed with the first element, then fold in the remaining WIDTH - 1 elements
++    float running_max = values[0];
++    for(int x = 1; x < WIDTH; ++x)
++    {
++        running_max = max(values[x], running_max);
++    }
++
++    // Store max
++    *((__global float *)(dst.ptr)) = running_max;
++}
++#endif // defined(WIDTH)
+diff --git a/src/core/CL/cl_kernels/reduction_mean.cl b/src/core/CL/cl_kernels/reduction_mean.cl
+new file mode 100644
+index 0000000..1a96eea
+--- /dev/null
++++ b/src/core/CL/cl_kernels/reduction_mean.cl
+@@ -0,0 +1,69 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2016, 2017 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#include "helpers.h"
++
++/** Returns the sum of the 8 consecutive elements starting at @p input. */
++inline DATA_TYPE sum_8(__global const DATA_TYPE *input)
++{
++    VEC_DATA_TYPE(DATA_TYPE, 8) acc = vload8(0, input);
++    acc.s0123 += acc.s4567;
++    acc.s01 += acc.s23;
++    return acc.s0 + acc.s1;
++}
++
++/** Sums, per work-item, 8 consecutive elements of each of @p height source rows and stores that sum divided by @p divider.
++ *
++ * @note The element type must be passed at compile time using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
++ *
++ * @param[in] src_ptr Pointer to the source image. Elements are accessed as DATA_TYPE
++ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
++ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
++ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
++ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
++ * @param[out] dst_ptr Pointer to the destination image; followed by the same stride/step/offset arguments as the source (dst_*)
++ * @param[in] local_sums Local memory buffer. Kept so the argument list is unchanged; the kernel does not access it
++ * @param[in] height Height of the input image
++ * @param[in] divider Divider to calculate mean
++ */
++__kernel void reduction_mean(
++    IMAGE_DECLARATION(src),
++    IMAGE_DECLARATION(dst),
++    __local DATA_TYPE *local_sums,
++    int height,
++    int divider)
++{
++    // Get pixels pointer
++    Image src = CONVERT_TO_IMAGE_STRUCT(src);
++    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
++
++    // Accumulate privately: local_sums is never initialised and is shared by the whole work-group without synchronisation
++    DATA_TYPE sum = 0;
++    for(int i = 0; i < height; i++)
++    {
++        sum += sum_8((__global DATA_TYPE *)offset(&src, 0, i));
++    }
++    ((__global DATA_TYPE *)offset(&dst, get_global_id(0), get_global_id(1)))[0] = sum / divider;
++}
+diff --git a/src/core/CL/cl_kernels/strided_slice.cl b/src/core/CL/cl_kernels/strided_slice.cl
+new file mode 100644
+index 0000000..c5ff82f
+--- /dev/null
++++ b/src/core/CL/cl_kernels/strided_slice.cl
+@@ -0,0 +1,104 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2017 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#include "helpers.h"
++
++/** Builds a Tensor4D over a vector's storage, deriving the Y/Z/W strides from the X stride and the x, y, z extents (dim_w is unused). */
++inline Tensor4D tensor4D_from_vector_no_step(const Vector *vector, int dim_x, int dim_y, int dim_z, int dim_w)
++{
++    int x_stride = vector->stride_x;
++    int y_stride = x_stride * dim_x;
++    int z_stride = y_stride * dim_y;
++    int w_stride = z_stride * dim_z;
++    Tensor4D view =
++    {
++        .ptr                           = vector->ptr,
++        .offset_first_element_in_bytes = vector->offset_first_element_in_bytes,
++        .stride_x                      = x_stride,
++        .stride_y                      = y_stride,
++        .stride_z                      = z_stride,
++        .stride_w                      = w_stride,
++    };
++    return view;
++}
++
++/** Extracts a strided slice of up to 4 dimensions, one output element per work-item
++ *
++ * @note Datatype should be given as a preprocessor argument using -DELEMENT_DATA_TYPE=type. e.g. -DELEMENT_DATA_TYPE=short
++ * @note The size of an element should be given as a preprocessor argument using -DELEMENT_SIZE=size. e.g. -DELEMENT_SIZE=2
++ *
++ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32
++ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
++ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
++ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
++ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
++ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
++ * @param[in] dims_in The 4-dimensional dimension of the input. Supported data types: S32
++ * @param[in] dims_out The 4-dimensional dimension of the output. Supported data types: S32
++ * @param[in] starts Index of the first element to take in each of the 4 dimensions of the input. Supported data types: S32
++ * @param[in] strides Step between two taken elements in each of the 4 dimensions of the input. Supported data types: S32
++ */
++__kernel void strided_slice(VECTOR_DECLARATION(input),
++                            VECTOR_DECLARATION(output),
++                            const int4 dims_in,
++                            const int4 dims_out,
++                            const int4 starts,
++                            const int4 strides)
++{
++    // TODO: Should be changed to CONVERT_TO_TENSOR4D_STRUCT so the offsets need not be inferred by hand
++    Vector vec_in  = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input);
++    Vector vec_out = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output);
++
++    // Implementation outline
++    //  1. View the flat input and output vectors as 4D tensors using their dimensions
++    //  2. Split this work-item's linear output position into 4D output indices
++    //  3. Map the output indices to input indices: start + stride * index
++    //  4. out(linear position) = in(input indices)
++
++    Tensor4D tensor_in  = tensor4D_from_vector_no_step(&vec_in, dims_in.x, dims_in.y, dims_in.z, dims_in.w);
++    Tensor4D tensor_out = tensor4D_from_vector_no_step(&vec_out, dims_out.x, dims_out.y, dims_out.z, dims_out.w);
++
++    // Must be output_step_x == output_stride_x == an element's size
++    const size_t linear_id  = get_global_id(0);
++    const int    offset_out = linear_id * output_stride_x;
++
++    // Output indices, one scalar per dimension
++    const int out_x = linear_id % dims_out.x;
++    const int out_y = (offset_out / tensor_out.stride_y) % dims_out.y;
++    const int out_z = (offset_out / tensor_out.stride_z) % dims_out.z;
++    const int out_w = (offset_out / tensor_out.stride_w) % dims_out.w;
++
++    // Matching input indices
++    const int in_x = starts.x + (strides.x * out_x);
++    const int in_y = starts.y + (strides.y * out_y);
++    const int in_z = starts.z + (strides.z * out_z);
++    const int in_w = starts.w + (strides.w * out_w);
++
++    // Copy the single element handled by this work-item
++    __global ELEMENT_DATA_TYPE *dst = (__global ELEMENT_DATA_TYPE *)vector_offset(&vec_out, linear_id);
++    __global ELEMENT_DATA_TYPE *src = (__global ELEMENT_DATA_TYPE *)tensor4D_offset(&tensor_in, in_x, in_y, in_z, in_w);
++    *dst = *src;
++}
+diff --git a/src/core/CL/cl_kernels/topkv2.cl b/src/core/CL/cl_kernels/topkv2.cl
+new file mode 100644
+index 0000000..0b0cf82
+--- /dev/null
++++ b/src/core/CL/cl_kernels/topkv2.cl
+@@ -0,0 +1,111 @@
++/*
++ * Copyright
(c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved ++ * Copyright (c) 2017 ARM Limited. ++ * ++ * SPDX-License-Identifier: MIT ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in all ++ * copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. 
++ */
++
++#include "helpers.h"
++
++/** Copies the n values of the input vector into in_key_buf and writes 0..n-1 into in_ind_buf.
++ *  Each work-item handles elements gid, gid + gws, gid + 2 * gws, ... below n. */
++__kernel void topkv2_init(VECTOR_DECLARATION(input),
++                          __global float* in_key_buf,
++                          __global int* in_ind_buf,
++                          const int n)
++{
++    const int gid = get_global_id(0);
++    // Total number of work-items along dimension 0
++    const int gws = get_local_size(0) * get_num_groups(0);
++
++    Vector src = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input);
++
++    // Step by gws up to n so the tail is initialised even when n is not a multiple of gws
++    for(int idx = gid; idx < n; idx += gws)
++    {
++        in_key_buf[idx] = *(__global float*)(src.ptr + idx * src.stride_x);
++        in_ind_buf[idx] = idx;
++    }
++}
++
++/** Stores in first_negative_idx the position of the first negative key, or n when the last key is non-negative. */
++__kernel void topkv2_find_first_negative(
++    __global float *out_key_buf,
++    __global int *first_negative_idx,
++    int n)
++{
++    const int gid = get_global_id(0);
++
++    // Classify by sign bit rather than by comparing with 0.f, so that zeros are classified as well.
++    const bool is_neg = signbit(out_key_buf[gid]) != 0;
++
++    if(gid == n - 1 && !is_neg)
++    {
++        // the last item is not negative, so the first negative index is n.
++        *first_negative_idx = n;
++    }
++    else if(is_neg && (gid == 0 || !signbit(out_key_buf[gid - 1])))
++    {
++        // this item is negative and is either the very first item or has a non-negative left neighbour.
++        *first_negative_idx = gid;
++    }
++}
++
++/** Reorders keys and indices using first_negative_idx: the entries from first_negative_idx to n - 1
++ *  are written first, in reverse order, followed by the entries before first_negative_idx in their
++ *  original order. The index buffers are int buffers, as in topkv2_init and topkv2_store. */
++__kernel void topkv2_reorder_negatives(
++    __global float* in_key_buf,
++    __global float* out_key_buf,
++    __global int* in_ind_buf,
++    __global int* out_ind_buf,
++    __global int* first_negative_idx,
++    int n)
++{
++    const int gid = get_global_id(0);
++
++    // Number of entries at or after the first negative one
++    const int num_negs = n - *first_negative_idx;
++
++    // The first num_negs outputs read backwards from the end; the others read from the start
++    const int in_idx = (gid < num_negs) ? (n - 1 - gid) : (gid - num_negs);
++
++    out_key_buf[gid] = in_key_buf[in_idx];
++    out_ind_buf[gid] = in_ind_buf[in_idx];
++}
++
++/** Writes the keys and indices to the values/indices vectors in reverse buffer order (output gid reads entry n - 1 - gid). */
++__kernel void topkv2_store(
++    VECTOR_DECLARATION(values),
++    VECTOR_DECLARATION(indices),
++    __global float *out_key_buf,
++    __global int *out_ind_buf,
++    int n)
++{
++    const int gid = get_global_id(0);
++    const int idx = n - 1 - gid;
++
++    Vector val_vec = CONVERT_TO_VECTOR_STRUCT_NO_STEP(values);
++    Vector ind_vec = CONVERT_TO_VECTOR_STRUCT_NO_STEP(indices);
++
++    *(__global float*)(val_vec.ptr + gid * val_vec.stride_x) = out_key_buf[idx];
++    *(__global int*)(ind_vec.ptr + gid * ind_vec.stride_x) = out_ind_buf[idx];
++}
+diff --git a/src/core/CL/cl_kernels/topkv2_quicksort.cl b/src/core/CL/cl_kernels/topkv2_quicksort.cl
+new file mode 100644
+index 0000000..deadf84
+--- /dev/null
++++ b/src/core/CL/cl_kernels/topkv2_quicksort.cl
+@@ -0,0 +1,138 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2017 ARM Limited.
++ * ++ * SPDX-License-Identifier: MIT ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in all ++ * copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. 
++ */
++
++#include "helpers.h"
++// Returns a pointer to the idx-th float element of vec (element idx lives idx * stride_x past vec->ptr).
++__global inline float* get_vec_elem(Vector* vec, int idx)
++{
++    return (__global float*)(vec->ptr + idx * vec->stride_x);
++}
++// Same as get_vec_elem, but the element is read/written as an int.
++__global inline int* get_vec_elem_int(Vector* vec, int idx)
++{
++    return (__global int*)(vec->ptr + idx * vec->stride_x);
++}
++
++// Swaps the two floats pointed to by a and b
++void swap(__global float *a, __global float *b)
++{
++    float t = *a;
++    *a = *b;
++    *b = t;
++}
++// Swaps the two ints pointed to by a and b
++void swap_idx(__global int *a, __global int *b)
++{
++    int t = *a;
++    *a = *b;
++    *b = t;
++}
++
++/* Partitions arr[l..h] around the pivot arr[h] (elements >= pivot first), mirroring every swap in indices; returns the pivot's final index */
++int partition (Vector* arr, __global int* indices, int l, int h)
++{
++    float x = *get_vec_elem(arr, h);
++    int i = (l - 1);
++
++    for (int j = l; j <= h- 1; j++)
++    {
++        if (*get_vec_elem(arr, j) >= x)
++        {
++            i++;
++            swap (get_vec_elem(arr,i), get_vec_elem(arr,j));
++            swap_idx(&indices[i], &indices[j]);
++        }
++    }
++    swap (get_vec_elem(arr, i + 1), get_vec_elem(arr, h));
++    swap_idx(&indices[i + 1], &indices[h]);
++    return (i + 1);
++}
++
++/* arr     --> vector sorted in place, largest first; indices --> permutation kept in step with arr,
++   stack   --> scratch storage for the pending [l, h] ranges,
++   l, h    --> first and last index (both inclusive) of the range to sort; requires l <= h */
++void quickSortIterative (Vector* arr, __global int* indices,
++                         __global int *stack, int l, int h)
++{
++    // NOTE(review): the capacity of stack is not checked here; presumably the host sizes it - confirm
++
++    // initialize top of stack
++    int top = -1;
++
++    // push initial values of l and h to stack
++    stack[ ++top ] = l;
++    stack[ ++top ] = h;
++
++    // Keep popping from the stack while it is not empty
++    while ( top >= 0 )
++    {
++        // Pop h and l
++        h = stack[ top-- ];
++        l = stack[ top-- ];
++
++        // Set pivot element at its correct position
++        // in sorted array
++        int p = partition( arr, indices, l, h );
++
++        // If there are at least two elements on the left side of the pivot,
++        // then push left side to stack
++        if ( p-1 > l )
++        {
++            stack[ ++top ] = l;
++            stack[ ++top ] = p - 1;
++        }
++
++        // If there are at least two elements on the right side of the pivot,
++        // then push right side to stack
++        if ( p+1 < h )
++        {
++            stack[ ++top ] = p + 1;
++            stack[ ++top ] = h;
++        }
++    }
++}
++// Sorts input in place (largest first) and writes the first min(k, n) values and their original positions to topk_values / topk_indices.
++__kernel void topkv2_quicksort(VECTOR_DECLARATION(input),
++                               VECTOR_DECLARATION(topk_values), VECTOR_DECLARATION(topk_indices),
++                               __global int* indices, __global int* temp_stack, int k, int n)
++{
++    Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input);
++    Vector topk_values = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_values);
++    Vector topk_indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_indices);
++
++    for( int i = 0; i < n; ++i )
++    {
++        indices[i] = i;
++    }
++    if( n > 1 ) // nothing to sort otherwise; n <= 0 would make partition() read element -1
++        quickSortIterative(&input, indices, temp_stack, 0, n-1);
++
++    // extract the first k items, but never more than the n that exist.
++    for(int i = 0; i < min(k, n); ++i)
++    {
++        *get_vec_elem(&topk_values, i) = *get_vec_elem(&input, i);
++        *get_vec_elem_int(&topk_indices, i) = indices[i];
++    }
++}
+diff --git a/src/core/CL/cl_kernels/topkv2_radixsort.cl b/src/core/CL/cl_kernels/topkv2_radixsort.cl
+new file mode 100644
+index 0000000..cac0c07
+--- /dev/null
++++ b/src/core/CL/cl_kernels/topkv2_radixsort.cl
+@@ -0,0 +1,279 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2017 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++ ++// reference: ++// https://code.google.com/archive/p/ocl-radix-sort/source/default/source ++// OpenCL kernel sources for the CLRadixSort class ++// the #include does not exist in OpenCL ++// Copyright Philippe Helluy, Université de Strasbourg, France, 2011, helluy@math.unistra.fr ++// licensed under the GNU Lesser General Public License see http://www.gnu.org/copyleft/lesser.html ++// if you find this software usefull you can cite the following work in your reports or articles: ++// Philippe HELLUY, A portable implementation of the radix sort algorithm in OpenCL, 2011. ++// http://hal.archives-ouvertes.fr/hal-00596730 ++ ++// Reference for floating point radix sort: ++// http://www.codercorner.com/RadixSortRevisited.htm ++ ++// compute, for the given pass, the histogram of radix digits seen by each work-item (virtual processor) ++__kernel void radixsort_histogram(__global float* in_key_buf, ++ __global int* d_Histograms, ++ const int pass, ++ __local int* loc_histo, ++ const int n) ++{ ++ int it = get_local_id(0); // local id of this work-item ++ int ig = get_global_id(0); // global id of this work-item ++ ++ int gr = get_group_id(0); // id of this work-group ++ ++ int groups = get_num_groups(0); ++ int items = get_local_size(0); ++ ++ // set the local histograms to zero ++ for(int ir=0;ir<_RADIX;ir++){ ++ loc_histo[ir * items + it] = 0; ++ } ++ ++ barrier(CLK_LOCAL_MEM_FENCE); ++ ++ // range of keys that are analyzed by the work item ++ int size= n/groups/items; // size of the sub-list ++ int start= ig * size; // beginning of the sub-list ++ ++ unsigned int key; ++ int shortkey,k; ++ ++ // compute the index of the j-th key of this work-item ++ // the computation depends on the transposition ++ for(int j = 0; j < size ; j++) { ++#ifdef 
TRANSPOSE ++ k= groups * items * j + ig; ++#else ++ k=j+start; ++#endif ++ ++ key = *((__global unsigned int*)(in_key_buf + k)); ++ ++ // extract the group of _BITS bits of the pass ++ // the result is in the range 0.._RADIX-1 ++ shortkey=(( key >> (pass * _BITS)) & (_RADIX-1)); ++ ++ // increment the local histogram ++ loc_histo[shortkey * items + it ]++; ++ } ++ ++ barrier(CLK_LOCAL_MEM_FENCE); ++ ++ // copy the local histogram to the global one ++ for(int ir=0;ir<_RADIX;ir++) { ++ d_Histograms[items * (ir * groups + gr) + it] = loc_histo[ir * items + it]; ++ } ++ ++ barrier(CLK_GLOBAL_MEM_FENCE); ++} ++ ++// initial transpose of the list for improving ++// coalesced memory access ++__kernel void transpose(const __global int* invect, ++ __global int* outvect, ++ const int nbcol, ++ const int nbrow, ++ const __global int* inperm, ++ __global int* outperm, ++ __local int* blockmat, ++ __local int* blockperm, ++ const int tilesize){ ++ ++ int i0 = get_global_id(0)*tilesize; // first row index ++ int j = get_global_id(1); // column index ++ ++ int jloc = get_local_id(1); // local column index ++ ++ // load one tile into local memory ++ for(int iloc=0;iloc<tilesize;iloc++){ ++ int k=(i0+iloc)*nbcol+j; // position in the matrix ++ blockmat[iloc*tilesize+jloc]=invect[k]; ++#ifdef PERMUT ++ blockperm[iloc*tilesize+jloc]=inperm[k]; ++#endif ++ } ++ ++ barrier(CLK_LOCAL_MEM_FENCE); ++ ++ // first row index in the transpose ++ int j0=get_group_id(1)*tilesize; ++ ++ // write the tile back at its transposed position ++ for(int iloc=0;iloc<tilesize;iloc++){ ++ int kt=(j0+iloc)*nbrow+i0+jloc; // position in the transpose ++ outvect[kt]=blockmat[jloc*tilesize+iloc]; ++#ifdef PERMUT ++ outperm[kt]=blockperm[jloc*tilesize+iloc]; ++#endif ++ } ++ ++} ++ ++// each virtual processor reorders its data using the scanned histogram ++__kernel void radixsort_reorder(__global float* in_key, ++ __global float* out_key, ++ __global int* d_Histograms, ++ const int pass, ++ __global int* indices_in, ++ __global int* 
indices_out, ++ __local int* loc_histo, ++ const int n){ ++ ++ int it = get_local_id(0); ++ int ig = get_global_id(0); ++ ++ int gr = get_group_id(0); ++ int groups=get_num_groups(0); ++ int items=get_local_size(0); ++ ++ int start= ig *(n/groups/items); ++ int size= n/groups/items; ++ ++ // copy this work-item's entries of the scanned histogram into local memory ++ for(int ir=0;ir<_RADIX;ir++){ ++ loc_histo[ir * items + it]= ++ d_Histograms[items * (ir * groups + gr) + it]; ++ } ++ barrier(CLK_LOCAL_MEM_FENCE); ++ ++ int newpos,shortkey,k,newpost; ++ unsigned int key; ++ ++ for(int j= 0; j< size;j++){ ++#ifdef TRANSPOSE ++ k= groups * items * j + ig; ++#else ++ k=j+start; ++#endif ++ float org_value = in_key[k]; ++ key = *(__global unsigned int*)(in_key + k); ++ shortkey=((key >> (pass * _BITS)) & (_RADIX-1)); ++ ++ newpos=loc_histo[shortkey * items + it]; ++ ++#ifdef TRANSPOSE ++ int ignew,jnew; ++ ignew= newpos/(n/groups/items); ++ jnew = newpos%(n/groups/items); ++ newpost = jnew * (groups*items) + ignew; ++#else ++ newpost=newpos; ++#endif ++ ++ //d_outKeys[newpost]= key; // killing line !!! 
++ out_key[newpost] = org_value; ++ ++#ifdef PERMUT ++ indices_out[newpost] = indices_in[k]; ++#endif ++ ++ newpos++; ++ loc_histo[shortkey * items + it]=newpos; ++ } ++} ++ ++// perform a parallel prefix sum (a scan) on the local histograms ++// (see Blelloch 1990) each work-item handles two elements ++// see also http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html ++__kernel void radixsort_scanhistograms(__global int* histo, __local int* temp, __global int* globsum) ++{ ++ int it = get_local_id(0); ++ int ig = get_global_id(0); ++ int decale = 1; // current stride between the two elements being combined ++ int n=get_local_size(0) * 2 ; ++ int gr=get_group_id(0); ++ ++ // load input into local memory ++ // up sweep phase ++ temp[2*it] = histo[2*ig]; ++ temp[2*it+1] = histo[2*ig+1]; ++ ++ // parallel prefix sum (algorithm of Blelloch 1990) ++ for (int d = n>>1; d > 0; d >>= 1){ ++ barrier(CLK_LOCAL_MEM_FENCE); ++ if (it < d){ ++ int ai = decale*(2*it+1)-1; ++ int bi = decale*(2*it+2)-1; ++ temp[bi] += temp[ai]; ++ } ++ decale *= 2; ++ } ++ ++ // store the last element in the global sum vector ++ // (maybe used in the next step for constructing the global scan) ++ // clear the last element ++ if (it == 0) { ++ globsum[gr]=temp[n-1]; ++ temp[n - 1] = 0; ++ } ++ ++ // down sweep phase ++ for (int d = 1; d < n; d *= 2){ ++ decale >>= 1; ++ barrier(CLK_LOCAL_MEM_FENCE); ++ ++ if (it < d){ ++ int ai = decale*(2*it+1)-1; ++ int bi = decale*(2*it+2)-1; ++ ++ int t = temp[ai]; ++ temp[ai] = temp[bi]; ++ temp[bi] += t; ++ } ++ ++ } ++ barrier(CLK_LOCAL_MEM_FENCE); ++ ++ // write results to device memory ++ ++ histo[2*ig] = temp[2*it]; ++ histo[2*ig+1] = temp[2*it+1]; ++ ++ barrier(CLK_GLOBAL_MEM_FENCE); ++ ++} ++ ++// add the work-group's global sum to its local histogram entries ++// each work item updates two values ++__kernel void radixsort_pastehistograms( __global int* histo,__global int* globsum) ++{ ++ int ig = get_global_id(0); ++ int gr=get_group_id(0); ++ ++ int s; ++ ++ 
s=globsum[gr]; ++ ++ // write results to device 
memory ++ histo[2*ig] += s; ++ histo[2*ig+1] += s; ++ ++ barrier(CLK_GLOBAL_MEM_FENCE); ++} +diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp +index a78b3e1..4c3ecad 100644 +--- a/src/core/CL/kernels/CLActivationLayerKernel.cpp ++++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp +@@ -33,6 +33,7 @@ + #include "arm_compute/core/Utils.h" + #include "arm_compute/core/Validate.h" + #include "arm_compute/core/Window.h" ++#include "arm_compute/core/utils/quantization/AsymmHelpers.h" + + #include "arm_compute/core/CL/CLHelpers.h" + #include "arm_compute/core/Types.h" +@@ -49,8 +50,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->data_type() == DataType::QASYMM8) && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + && (act_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) +- && (act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU), +- "For QASYMM8 only relu, lower bounded relu and lower-upper bounded relu are supported"); ++ && (act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU) ++ && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC), ++ "For QASYMM8 only relu, lower bounded relu, lower-upper bounded relu and logistic are supported"); + + // Checks performed when output is configured + if((output != nullptr) && (output->total_size() != 0)) +@@ -93,6 +95,43 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen + Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); + } ++ ++inline bool is_activation_logistic(ActivationLayerInfo &act_info) ++{ ++ if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LOGISTIC) ++ { ++ return true; ++ } ++ return false; ++} ++ ++/** Calculates logistic parameters from the quantized input scale and scaling factor for the exponent and places them as build options. ++ * ++ * Prepares these build options: ++ * -INPUT_MULTIPLIER, INPUT_LEFT_SHIFT - quantized representation of multiplier. ++ * -INPUT_RANGE_RADIUS - threshold difference between maximum value of input data and current processed value. ++ * it defines whether the value will be taken into account or not. ++ * ++ * @param[in] build_opts Build options to extend ++ * @param[in] input_scale Input scaling factor ++ */ ++void prepare_quantized_logistic_build_options(std::set<std::string> *build_opts, float input_scale) ++{ ++ // Number of integer bits in temporary fixed-point representation of current-to-max difference ++ static const int input_integer_bits = 4; ++ ++ const double input_real_multiplier = input_scale * (1ll << (31 - input_integer_bits)); ++ int input_multiplier, input_left_shift; ++ quantization::calculate_quantized_multiplier_greater_than_one(input_real_multiplier, &input_multiplier, &input_left_shift); ++ ++ const double max_input_rescaled = 1.0 * ((1 << input_integer_bits) - 1) * (1ll << (31 - input_integer_bits)) / (1ll << input_left_shift); ++ const int input_range_radius = std::floor(max_input_rescaled); ++ ++ build_opts->emplace(("-DINPUT_INTEGER_BITS=" + support::cpp11::to_string(input_integer_bits))); ++ build_opts->emplace(("-DINPUT_MULTIPLIER=" + support::cpp11::to_string(input_multiplier))); ++ build_opts->emplace(("-DINPUT_LEFT_SHIFT=" + support::cpp11::to_string(input_left_shift))); ++ build_opts->emplace(("-DINPUT_RANGE_RADIUS=" + support::cpp11::to_string(input_range_radius))); 
++} + } // namespace + + CLActivationLayerKernel::CLActivationLayerKernel() +@@ -181,8 +220,16 @@ void CLActivationLayerKernel::configure(ICLTensor *input, ICLTensor *output, Act + build_opts.emplace(("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(fixed_point_position))); + } + ++ if(is_data_type_quantized_asymmetric(dt) && is_activation_logistic(act_info)) ++ { ++ prepare_quantized_logistic_build_options(&build_opts, input->info()->quantization_info().scale); ++ } ++ + // Create kernel +- std::string kernel_name = is_data_type_quantized_asymmetric(dt) ? std::string("activation_layer_qa8") : std::string("activation_layer"); ++ std::string kernel_name = is_data_type_quantized_asymmetric(dt) && is_activation_logistic(act_info) ? ++ std::string("activation_layer_logistic_qa8") : ++ is_data_type_quantized_asymmetric(dt) ? ++ std::string("activation_layer_qa8") : std::string("activation_layer"); + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); + + // Make sure _kernel is initialized before calling the parent's configure +diff --git a/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp b/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp +index c4904ec..f5f4f1a 100644 +--- a/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp ++++ b/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp +@@ -25,6 +25,7 @@ + + #include "arm_compute/core/CL/CLHelpers.h" + #include "arm_compute/core/CL/ICLTensor.h" ++#include "arm_compute/core/utils/quantization/AsymmHelpers.h" + + using namespace arm_compute; + +@@ -36,8 +37,13 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, + { + ARM_COMPUTE_UNUSED(policy); + +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, 
DataType::F16, DataType::F32); ++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); ++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); ++ if (is_data_type_quantized_asymmetric(output.data_type())) ++ { ++ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output, &input1); ++ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output, &input2); ++ } + + const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape()); + +@@ -47,7 +53,7 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, + // Validate in case of configured output + if(output.total_size() > 0) + { +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); ++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((output.data_type() == DataType::U8) && ((input1.data_type() != DataType::U8) || (input2.data_type() != DataType::U8)), + "Output can only be U8 if both inputs are U8"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0), +@@ -132,8 +138,40 @@ void CLArithmeticAdditionKernel::configure(const ICLTensor *input1, const ICLTen + build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input1->info()->fixed_point_position())); + } + ++ if (is_data_type_quantized_asymmetric(output->info()->data_type())) ++ { ++ const int left_shift = 20; ++ const double twice_max_input_scale = 2 * std::max(input1->info()->quantization_info().scale, 
input2->info()->quantization_info().scale); ++ const double real_input1_multiplier = input1->info()->quantization_info().scale / twice_max_input_scale; ++ const double real_input2_multiplier = input2->info()->quantization_info().scale / twice_max_input_scale; ++ const double real_output_multiplier = twice_max_input_scale / ((1 << left_shift) * output->info()->quantization_info().scale); ++ ++ int input1_multiplier, input2_multiplier, output_multiplier; ++ int input1_shift, input2_shift, output_shift; ++ quantization::calculate_quantized_multiplier_less_than_one(real_input1_multiplier, &input1_multiplier, &input1_shift); ++ quantization::calculate_quantized_multiplier_less_than_one(real_input2_multiplier, &input2_multiplier, &input2_shift); ++ quantization::calculate_quantized_multiplier_less_than_one(real_output_multiplier, &output_multiplier, &output_shift); ++ ++ build_opts.emplace("-DIN1_MULT_INT=" + support::cpp11::to_string(input1_multiplier)); ++ build_opts.emplace("-DIN2_MULT_INT=" + support::cpp11::to_string(input2_multiplier)); ++ build_opts.emplace("-DRESULT_MULT_INT=" + support::cpp11::to_string(output_multiplier)); ++ build_opts.emplace("-DLEFT_SHIFT=" + support::cpp11::to_string(left_shift)); ++ build_opts.emplace("-DIN1_SHIFT=" + support::cpp11::to_string(input1_shift)); ++ build_opts.emplace("-DIN2_SHIFT=" + support::cpp11::to_string(input2_shift)); ++ build_opts.emplace("-DRESULT_SHIFT=" + support::cpp11::to_string(output_shift)); ++ build_opts.emplace("-DIN1_OFFSET=" + support::cpp11::to_string(-(input1->info()->quantization_info().offset))); ++ build_opts.emplace("-DIN2_OFFSET=" + support::cpp11::to_string(-(input2->info()->quantization_info().offset))); ++ build_opts.emplace("-DRESULT_OFFSET=" + support::cpp11::to_string(output->info()->quantization_info().offset)); ++ // TODO: Apply min-max BOUND to support fuse with relu. 
++ } ++ + // Create kernel +- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("arithmetic_add", build_opts)); ++ std::string kernel_name = "arithmetic_add"; ++ if (is_data_type_quantized_asymmetric(output->info()->data_type())) ++ { ++ kernel_name += "_qasymm8"; ++ } ++ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); + + ICLKernel::configure(win_config.second); + } +diff --git a/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp b/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp +index 8308aa0..3053222 100644 +--- a/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp ++++ b/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp +@@ -1,5 +1,6 @@ + /* +- * Copyright (c) 2016, 2017 ARM Limited. ++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved ++ * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * +@@ -24,37 +25,33 @@ + #include "arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h" + + #include "arm_compute/core/CL/CLHelpers.h" +-#include "arm_compute/core/CL/CLKernelLibrary.h" + #include "arm_compute/core/CL/ICLTensor.h" +-#include "arm_compute/core/CL/OpenCL.h" +-#include "arm_compute/core/Helpers.h" +-#include "arm_compute/core/IAccessWindow.h" +-#include "arm_compute/core/TensorInfo.h" +-#include "arm_compute/core/Validate.h" +-#include "arm_compute/core/Window.h" +- +-#include <set> +-#include <string> + + using namespace arm_compute; + + namespace + { ++constexpr unsigned int num_elems_processed_per_iteration = 16; ++ + Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy) + { + ARM_COMPUTE_UNUSED(policy); ++ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, 
DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, input2); ++ ++ const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); ++ ++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2); + + // Validate in case of configured output +- if((output != nullptr) && (output->total_size() != 0)) ++ if(output->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8), + "Output can only be U8 if both inputs are U8"); +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, output); ++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output"); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, output); + } + +@@ -63,17 +60,39 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, + + std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output) + { +- constexpr unsigned int num_elems_processed_per_iteration = 16; ++ const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2); ++ const TensorShape &out_shape = broadcast_pair.first; ++ const ValidRegion &valid_region = broadcast_pair.second; ++ ++ // Auto initialize output if not initialized ++ { ++ set_shape_if_empty(*output, out_shape); ++ ++ if(input1->data_type() == DataType::S16 || input2->data_type() == DataType::S16) ++ { ++ set_format_if_unknown(*output, Format::S16); ++ } ++ 
else if(input1->data_type() == DataType::F16 && input2->data_type() == DataType::F16) ++ { ++ set_format_if_unknown(*output, Format::F16); ++ } ++ else if(input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32) ++ { ++ set_format_if_unknown(*output, Format::F32); ++ } ++ } ++ ++ Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); ++ Window win_input1 = win.broadcast_if_dimension_le_one(*input1); ++ Window win_input2 = win.broadcast_if_dimension_le_one(*input2); + +- Window win = calculate_max_window(*input1, Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + +- bool window_changed = update_window_and_padding(win, input1_access, input2_access, output_access); +- +- ValidRegion valid_region = intersect_valid_regions(input1->valid_region(), +- input2->valid_region()); ++ bool window_changed = update_window_and_padding(win_input1, input1_access) ++ || update_window_and_padding(win_input2, input2_access) ++ || update_window_and_padding(win, output_access); + + output_access.set_valid_region(win, valid_region); + +@@ -90,28 +109,17 @@ CLArithmeticSubtractionKernel::CLArithmeticSubtractionKernel() + void CLArithmeticSubtractionKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); +- +- // Auto initialize output if not initialized +- { +- set_shape_if_empty(*output->info(), input1->info()->tensor_shape()); +- +- if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16) +- { +- set_format_if_unknown(*output->info(), Format::S16); +- } +- else if(input1->info()->data_type() == DataType::F32 || 
input2->info()->data_type() == DataType::F32) +- { +- set_format_if_unknown(*output->info(), Format::F32); +- } +- } +- + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(), policy)); + ++ // Configure kernel window ++ auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info()); ++ ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ++ + _input1 = input1; + _input2 = input2; + _output = output; + +- bool has_float_out = is_data_type_float(output->info()->data_type()); ++ const bool has_float_out = is_data_type_float(output->info()->data_type()); + + // Set kernel build options + std::set<std::string> build_opts; +@@ -127,14 +135,12 @@ void CLArithmeticSubtractionKernel::configure(const ICLTensor *input1, const ICL + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("arithmetic_sub", build_opts)); + +- // Configure kernel window +- auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info()); +- ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure(win_config.second); + } + + Status CLArithmeticSubtractionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy) + { ++ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, policy)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get()).first); + +@@ -146,16 +152,49 @@ void CLArithmeticSubtractionKernel::run(const Window &window, cl::CommandQueue & + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + +- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); +- Window slice = collapsed.first_slice_window_3D(); ++ const TensorShape &in_shape1 = 
_input1->info()->tensor_shape(); ++ const TensorShape &in_shape2 = _input2->info()->tensor_shape(); ++ const TensorShape &out_shape = _output->info()->tensor_shape(); ++ ++ bool can_collapse = true; ++ if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) ++ { ++ can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); ++ for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) ++ { ++ can_collapse = (in_shape1[d] == in_shape2[d]); ++ } ++ } ++ ++ bool has_collapsed = false; ++ Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; ++ ++ const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; ++ const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; ++ ++ Window slice = collapsed.first_slice_window_3D(); ++ Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); ++ Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); + + do + { + unsigned int idx = 0; +- add_3D_tensor_argument(idx, _input1, slice); +- add_3D_tensor_argument(idx, _input2, slice); ++ ++ add_3D_tensor_argument(idx, _input1, slice_input1); ++ add_3D_tensor_argument(idx, _input2, slice_input2); + add_3D_tensor_argument(idx, _output, slice); ++ + enqueue(queue, *this, slice); ++ ++ collapsed.slide_window_slice_3D(slice_input1); ++ collapsed.slide_window_slice_3D(slice_input2); + } + while(collapsed.slide_window_slice_3D(slice)); + } ++ ++BorderSize CLArithmeticSubtractionKernel::border_size() const ++{ ++ const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); ++ const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); ++ return BorderSize(0, border, 0, 0); ++} +diff --git 
a/src/core/CL/kernels/CLCastKernel.cpp b/src/core/CL/kernels/CLCastKernel.cpp +new file mode 100644 +index 0000000..204ae74 +--- /dev/null ++++ b/src/core/CL/kernels/CLCastKernel.cpp +@@ -0,0 +1,115 @@ ++/* ++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved ++ * Copyright (c) 2016-2018 ARM Limited. ++ * ++ * SPDX-License-Identifier: MIT ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in all ++ * copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. 
++ */ ++#include "arm_compute/core/CL/kernels/CLCastKernel.h" ++ ++#include "arm_compute/core/CL/CLHelpers.h" ++#include "arm_compute/core/CL/CLKernelLibrary.h" ++#include "arm_compute/core/CL/ICLTensor.h" ++#include "arm_compute/core/Helpers.h" ++#include "arm_compute/core/IAccessWindow.h" ++#include "arm_compute/core/TensorInfo.h" ++#include "arm_compute/core/Utils.h" ++#include "arm_compute/core/Validate.h" ++#include "arm_compute/core/Window.h" ++ ++using namespace arm_compute; ++ ++CLCastKernel::CLCastKernel() ++ : _input(nullptr), _output(nullptr) ++{ ++} ++ ++void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output) ++{ ++ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ++ ++ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, ++ DataType::S16, DataType::S32, DataType::F16, DataType::F32); ++ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, ++ DataType::S16, DataType::S32, DataType::F16, DataType::F32); ++ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); ++ ++ _input = input; ++ _output = output; ++ ++ constexpr unsigned int num_elems_processed_per_iteration = 16; ++ ++ // Set kernel build options ++ std::set<std::string> build_opts; ++ build_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type())); ++ build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); ++ build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); ++ ++ // Create kernel ++ if (is_data_type_quantized_asymmetric(input->info()->data_type())) ++ { ++ const float scale_in = input->info()->quantization_info().scale; ++ const int offset_in = input->info()->quantization_info().offset; ++ build_opts.emplace("-DSCALE_IN=" + float_to_string_with_full_precision(scale_in)); ++ build_opts.emplace("-DOFFSET_IN=" + support::cpp11::to_string(offset_in)); ++ ++ _kernel = 
static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("cast_qasymm_in", build_opts)); ++ } ++ else if (is_data_type_quantized_asymmetric(output->info()->data_type())) ++ { ++ const float scale_in = output->info()->quantization_info().scale; ++ const int offset_in = output->info()->quantization_info().offset; ++ build_opts.emplace("-DSCALE_IN=" + float_to_string_with_full_precision(scale_in)); ++ build_opts.emplace("-DOFFSET_IN=" + support::cpp11::to_string(offset_in)); ++ ++ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("cast_qasymm_out", build_opts)); ++ } ++ else ++ { ++ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("cast", build_opts)); ++ } ++ ++ // Configure kernel window ++ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); ++ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); ++ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); ++ update_window_and_padding(win, input_access, output_access); ++ output_access.set_valid_region(win, input->info()->valid_region()); ++ ++ ICLKernel::configure(win); ++} ++ ++void CLCastKernel::run(const Window &window, cl::CommandQueue &queue) ++{ ++ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ++ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); ++ ++ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); ++ Window slice = collapsed.first_slice_window_3D(); ++ ++ do ++ { ++ unsigned int idx = 0; ++ add_3D_tensor_argument(idx, _input, slice); ++ add_3D_tensor_argument(idx, _output, slice); ++ enqueue(queue, *this, slice); ++ } ++ while(collapsed.slide_window_slice_3D(slice)); ++} +diff --git a/src/core/CL/kernels/CLGatherKernel.cpp b/src/core/CL/kernels/CLGatherKernel.cpp +new file mode 100644 +index 0000000..0a83008 +--- /dev/null ++++ b/src/core/CL/kernels/CLGatherKernel.cpp +@@ -0,0 +1,147 @@ ++/* ++ * 
Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved ++ * Copyright (c) 2016-2018 ARM Limited. ++ * ++ * SPDX-License-Identifier: MIT ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in all ++ * copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. 
++ */ ++#include "arm_compute/core/CL/kernels/CLGatherKernel.h" ++ ++#include "arm_compute/core/CL/CLHelpers.h" ++#include "arm_compute/core/CL/CLKernelLibrary.h" ++#include "arm_compute/core/CL/ICLTensor.h" ++#include "arm_compute/core/CL/OpenCL.h" ++#include "arm_compute/core/Error.h" ++#include "arm_compute/core/Helpers.h" ++#include "arm_compute/core/TensorInfo.h" ++#include "arm_compute/core/Validate.h" ++#include "arm_compute/core/Window.h" ++ ++#include <cmath> ++#include <cstdlib> ++#include <set> ++#include <string> ++ ++using namespace arm_compute; ++ ++namespace ++{ ++constexpr unsigned int num_elems_processed_per_iteration = 16; ++ ++Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) ++{ ++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S32, DataType::F32); ++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S32); ++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S32, DataType::F32); ++ ++ return Status{}; ++} ++ ++} // namespace ++ ++CLGatherKernel::CLGatherKernel() ++ : _input1(nullptr), _input2(nullptr), _output(nullptr) ++{ ++} ++ ++void CLGatherKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) ++{ ++ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); ++ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S32); ++ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); ++ ++ _input1 = input1; ++ _input2 = input2; ++ _output = output; ++ ++ // Construct kernel name ++ std::string kernel_name = "gather"; ++ if (input1->info()->num_dimensions()==1) ++ { ++ kernel_name = "gather_1d"; ++ } ++ else if (input1->info()->num_dimensions()==2) ++ { ++ if(_output->info()->num_dimensions()==1) ++ { ++ kernel_name = "gather_1d_out"; ++ } ++ } ++ ++ // Set kernel build options ++ std::set<std::string> build_opts; ++ 
build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type())); ++ build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type())); ++ build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); ++ ++ // Create kernel ++ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); ++ ++ // Configure kernel window ++ const unsigned int num_elems_processed_per_iteration = 1; ++ Window win = calculate_max_window(*input2->info(), Steps(num_elems_processed_per_iteration)); ++ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); ++ ++ ICLKernel::configure(win); ++} ++ ++Status CLGatherKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) ++{ ++ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); ++ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output)); ++ ++ return Status{}; ++} ++ ++void CLGatherKernel::run(const Window &window, cl::CommandQueue &queue) ++{ ++ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ++ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); ++ ++ if (_input1->info()->num_dimensions()==1) ++ { ++ Window slice = window.first_slice_window_1D(); ++ ++ unsigned int idx = 0; ++ add_1D_tensor_argument(idx, _input1, slice); ++ add_1D_tensor_argument(idx, _input2, slice); ++ add_1D_tensor_argument(idx, _output, slice); ++ enqueue(queue, *this, slice); ++ } ++ else if (_input1->info()->num_dimensions()==2) ++ { ++ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY); ++ Window slice = window.collapse_if_possible(ICLKernel::window(), Window::DimX); ++ ++ //Set inputs ++ unsigned int idx = 0; ++ add_2D_tensor_argument(idx, _input1, window_collapsed); ++ add_1D_tensor_argument(idx, _input2, slice); ++ if(_output->info()->num_dimensions()==1) ++ { ++ 
add_1D_tensor_argument(idx, _output, slice); ++ } ++ else ++ { ++ add_2D_tensor_argument(idx, _output, window_collapsed); ++ } ++ enqueue(queue, *this, slice); ++ } ++} +diff --git a/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp b/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp +new file mode 100644 +index 0000000..26cb3e2 +--- /dev/null ++++ b/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp +@@ -0,0 +1,284 @@ ++/* ++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved ++ * Copyright (c) 2016-2018 ARM Limited. ++ * ++ * SPDX-License-Identifier: MIT ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in all ++ * copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. 
++ */ ++#include "arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h" ++ ++#include "arm_compute/core/CL/CLHelpers.h" ++#include "arm_compute/core/CL/CLKernelLibrary.h" ++#include "arm_compute/core/CL/ICLTensor.h" ++#include "arm_compute/core/CL/OpenCL.h" ++#include "arm_compute/core/Error.h" ++#include "arm_compute/core/Helpers.h" ++#include "arm_compute/core/TensorInfo.h" ++#include "arm_compute/core/Validate.h" ++#include "arm_compute/core/Window.h" ++ ++#include <cmath> ++#include <cstdlib> ++#include <set> ++#include <string> ++ ++using namespace arm_compute; ++ ++namespace ++{ ++constexpr unsigned int num_elems_processed_per_iteration = 16; ++ ++Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ++ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy) ++{ ++ ARM_COMPUTE_UNUSED(overflow_policy); ++ ARM_COMPUTE_UNUSED(rounding_policy); ++ ++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); ++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); ++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative."); ++ ++ const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); ++ ++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); ++ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2); ++ ++ if(is_data_type_fixed_point(input1->data_type())) ++ { ++ // All data types must be all QS8 or all QS16 ++ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2); ++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale != 1, "Unsupported scaling factor for QS8/QS16. 
Scale must be 1."); ++ } ++ ++ // Validate in case of configured output ++ if(output->total_size() > 0) ++ { ++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); ++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8), ++ "Output can only be U8 if both inputs are U8"); ++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output"); ++ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, output); ++ if(is_data_type_fixed_point(input1->data_type())) ++ { ++ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); ++ } ++ } ++ ++ return Status{}; ++} ++ ++std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output) ++{ ++ const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2); ++ const TensorShape &out_shape = broadcast_pair.first; ++ const ValidRegion &valid_region = broadcast_pair.second; ++ ++ // Auto initialize output if not initialized ++ { ++ set_shape_if_empty(*output, out_shape); ++ ++ if(input1->data_type() == DataType::S16 || input2->data_type() == DataType::S16) ++ { ++ set_format_if_unknown(*output, Format::S16); ++ } ++ else if(input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32) ++ { ++ set_format_if_unknown(*output, Format::F32); ++ } ++ } ++ ++ Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); ++ Window win_input1 = win.broadcast_if_dimension_le_one(*input1); ++ Window win_input2 = win.broadcast_if_dimension_le_one(*input2); ++ ++ AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration); ++ AccessWindowHorizontal input2_access(input2, 0, 
num_elems_processed_per_iteration); ++ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); ++ ++ bool window_changed = update_window_and_padding(win_input1, input1_access) ++ || update_window_and_padding(win_input2, input2_access) ++ || update_window_and_padding(win, output_access); ++ ++ output_access.set_valid_region(win, valid_region); ++ ++ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; ++ return std::make_pair(err, win); ++} ++} // namespace ++ ++CLPixelWiseDivisionKernel::CLPixelWiseDivisionKernel() ++ : _input1(nullptr), _input2(nullptr), _output(nullptr) ++{ ++} ++ ++void CLPixelWiseDivisionKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale, ++ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy) ++{ ++ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); ++ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(), ++ scale, overflow_policy, rounding_policy)); ++ ++ // Configure kernel window ++ auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info()); ++ ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ++ ++ _input1 = input1; ++ _input2 = input2; ++ _output = output; ++ ++ int scale_int = -1; ++ // Extract sign, exponent and mantissa ++ int exponent = 0; ++ float normalized_mantissa = std::frexp(scale, &exponent); ++ // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15 ++ // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -1 <= e <= 14 ++ // Moreover, it will be negative as we deal with 1/2^n ++ if((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)) ++ { ++ // Store the positive exponent. 
We know that we compute 1/2^n ++ // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5 ++ scale_int = std::abs(exponent - 1); ++ } ++ ++ std::string data_type; ++ std::string compute_type; ++ // Check if it has float inputs and output ++ if(is_data_type_float(input1->info()->data_type()) || is_data_type_float(input2->info()->data_type())) ++ { ++ scale_int = -1; ++ compute_type = (input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32) ? "float" : "half"; ++ data_type = "DATA_TYPE_FLOAT"; ++ } ++ else ++ { ++ if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16) ++ { ++ compute_type = "int"; ++ } ++ else if(input1->info()->data_type() == DataType::QS8) ++ { ++ compute_type = "qs8"; ++ } ++ else if(input1->info()->data_type() == DataType::QS16) ++ { ++ compute_type = "qs16"; ++ } ++ else ++ { ++ compute_type = "ushort"; ++ } ++ data_type = "DATA_TYPE_INT"; ++ } ++ ++ // Construct kernel name ++ std::string kernel_name = "pixelwise_div"; ++ kernel_name += (scale_int >= 0) ? "_int" : "_float"; ++ ++ // Set kernel build options ++ std::set<std::string> build_opts; ++ build_opts.emplace((overflow_policy == ConvertPolicy::WRAP || is_data_type_float(output->info()->data_type())) ? "-DWRAP" : "-DSATURATE"); ++ build_opts.emplace((rounding_policy == RoundingPolicy::TO_ZERO) ? 
"-DROUND=_rtz" : "-DROUND=_rte"); ++ if(is_data_type_fixed_point(input1->info()->data_type())) ++ { ++ build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input1->info()->fixed_point_position())); ++ } ++ build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type())); ++ build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type())); ++ build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); ++ build_opts.emplace("-DDATA_TYPE_RES=" + compute_type); ++ build_opts.emplace("-D" + data_type); ++ ++ // Create kernel ++ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); ++ ++ // Set scale argument ++ unsigned int idx = 3 * num_arguments_per_3D_tensor(); //Skip the inputs and output parameters ++ ++ if(scale_int >= 0) ++ { ++ _kernel.setArg(idx++, scale_int); ++ } ++ else ++ { ++ _kernel.setArg(idx++, scale); ++ } ++ ++ ICLKernel::configure(win_config.second); ++} ++ ++Status CLPixelWiseDivisionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ++ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy) ++{ ++ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); ++ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, scale, overflow_policy, rounding_policy)); ++ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get()).first); ++ ++ return Status{}; ++} ++ ++void CLPixelWiseDivisionKernel::run(const Window &window, cl::CommandQueue &queue) ++{ ++ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ++ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); ++ ++ const TensorShape &in_shape1 = _input1->info()->tensor_shape(); ++ const TensorShape &in_shape2 = _input2->info()->tensor_shape(); ++ const TensorShape &out_shape = 
_output->info()->tensor_shape(); ++ ++ bool can_collapse = true; ++ if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) ++ { ++ can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); ++ for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d) ++ { ++ can_collapse = (in_shape1[d] == in_shape2[d]); ++ } ++ } ++ ++ bool has_collapsed = false; ++ Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; ++ ++ const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; ++ const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; ++ ++ Window slice = collapsed.first_slice_window_3D(); ++ Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); ++ Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); ++ ++ do ++ { ++ unsigned int idx = 0; ++ add_3D_tensor_argument(idx, _input1, slice_input1); ++ add_3D_tensor_argument(idx, _input2, slice_input2); ++ add_3D_tensor_argument(idx, _output, slice); ++ enqueue(queue, *this, slice); ++ ++ collapsed.slide_window_slice_3D(slice_input1); ++ collapsed.slide_window_slice_3D(slice_input2); ++ } ++ while(collapsed.slide_window_slice_3D(slice)); ++} ++ ++BorderSize CLPixelWiseDivisionKernel::border_size() const ++{ ++ const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); ++ const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); ++ return BorderSize(0, border, 0, 0); ++} +diff --git a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp +index f30ba61..8aa77ae 100644 +--- a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp ++++ 
b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp +@@ -32,6 +32,7 @@ + #include "arm_compute/core/TensorInfo.h" + #include "arm_compute/core/Validate.h" + #include "arm_compute/core/Window.h" ++#include "arm_compute/core/utils/quantization/AsymmHelpers.h" + + #include <cmath> + #include <cstdlib> +@@ -50,8 +51,13 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, + ARM_COMPUTE_UNUSED(overflow_policy); + ARM_COMPUTE_UNUSED(rounding_policy); + +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); ++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); ++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); ++ if (is_data_type_quantized_asymmetric(output->data_type())) ++ { ++ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(output, input1); ++ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(output, input2); ++ } + ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative."); + + const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); +@@ -69,7 +75,7 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, + // Validate in case of configured output + if(output->total_size() > 0) + { +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); ++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, 
DataType::QASYMM8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8), + "Output can only be U8 if both inputs are U8"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output"); +@@ -188,7 +194,15 @@ void CLPixelWiseMultiplicationKernel::configure(const ICLTensor *input1, const I + + // Construct kernel name + std::string kernel_name = "pixelwise_mul"; +- kernel_name += (scale_int >= 0) ? "_int" : "_float"; ++ if (is_data_type_quantized_asymmetric(output->info()->data_type())) ++ { ++ compute_type = "qasymm8"; ++ kernel_name += "_qasymm8"; ++ } ++ else ++ { ++ kernel_name += (scale_int >= 0) ? "_int" : "_float"; ++ } + + // Set kernel build options + std::set<std::string> build_opts; +@@ -204,6 +218,21 @@ void CLPixelWiseMultiplicationKernel::configure(const ICLTensor *input1, const I + build_opts.emplace("-DDATA_TYPE_RES=" + compute_type); + build_opts.emplace("-D" + data_type); + ++ if (is_data_type_quantized_asymmetric(output->info()->data_type())) ++ { ++ const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? 
input1->info()->quantization_info() : output->info()->quantization_info(); ++ ++ float multiplier = input1->info()->quantization_info().scale * input2->info()->quantization_info().scale / output_quant_info.scale; ++ int output_multiplier, output_shift; ++ quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift); ++ ++ build_opts.emplace("-DRESULT_MULT_INT=" + support::cpp11::to_string(output_multiplier)); ++ build_opts.emplace("-DRESULT_SHIFT=" + support::cpp11::to_string(output_shift)); ++ build_opts.emplace("-DIN1_OFFSET=" + support::cpp11::to_string(-(input1->info()->quantization_info().offset))); ++ build_opts.emplace("-DIN2_OFFSET=" + support::cpp11::to_string(-(input2->info()->quantization_info().offset))); ++ build_opts.emplace("-DRESULT_OFFSET=" + support::cpp11::to_string(output->info()->quantization_info().offset)); ++ // TODO: Apply min-max BOUND to support fuse with relu. ++ } + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); + +diff --git a/src/core/CL/kernels/CLReduceMaxKernel.cpp b/src/core/CL/kernels/CLReduceMaxKernel.cpp +new file mode 100644 +index 0000000..cb1ee03 +--- /dev/null ++++ b/src/core/CL/kernels/CLReduceMaxKernel.cpp +@@ -0,0 +1,135 @@ ++/* ++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved ++ * Copyright (c) 2016-2018 ARM Limited. 
++ * ++ * SPDX-License-Identifier: MIT ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in all ++ * copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. 
++ */ ++#include "arm_compute/core/CL/kernels/CLReduceMaxKernel.h" ++ ++#include "arm_compute/core/CL/CLHelpers.h" ++#include "arm_compute/core/CL/CLKernelLibrary.h" ++#include "arm_compute/core/CL/ICLTensor.h" ++#include "arm_compute/core/CL/OpenCL.h" ++#include "arm_compute/core/Error.h" ++#include "arm_compute/core/Helpers.h" ++#include "arm_compute/core/TensorInfo.h" ++#include "arm_compute/core/Validate.h" ++#include "arm_compute/core/Window.h" ++ ++#include <cmath> ++#include <cstdlib> ++#include <set> ++#include <string> ++ ++using namespace arm_compute; ++ ++namespace ++{ ++constexpr unsigned int num_elems_processed_per_iteration = 16; ++ ++Status validate_arguments(const ITensorInfo *input, int32_t axis, const ITensorInfo *output) ++{ ++ // We can handle for simple case only ++ // Input rank: 2 ++ // Output rank: 1 ++ // Axis: one axis value, restrict to 1 ++ ++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); ++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis != 1, "Axis only allowed 1"); ++ ++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0, "Inputs are not broadcast compatible"); ++ ++ // Validate in case of configured output ++ if(output->total_size() > 0) ++ { ++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32); ++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != input->data_type(), ++ "Output same type allowed for input and output"); ++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().num_dimensions() != 1, "Only support for output dimension 1"); ++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->tensor_shape().num_dimensions() != 2, "Only support for input dimension 2"); ++ } ++ ++ return Status{}; ++} ++ ++} // namespace ++ ++CLReduceMaxKernel::CLReduceMaxKernel() ++ : _input(nullptr), _output(nullptr), _axis(0) ++{ ++} ++ ++void CLReduceMaxKernel::configure(const ICLTensor *input, int32_t axis, ICLTensor *output) ++{ ++ 
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ++ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, output->info())); ++ ++ _input = input; ++ _output = output; ++ _axis = axis; ++ ++ // Configure kernel window ++ int cols = _input->info()->tensor_shape()[0]; ++ int rows = _input->info()->tensor_shape()[1]; ++ Window win; ++ win.set(0, Window::Dimension(0, cols, 1)); ++ win.set(1, Window::Dimension(0, rows, 1)); ++ ++ // Construct kernel name ++ std::string kernel_name = "reduce_max"; ++ ++ // Set kernel build options ++ std::set<std::string> build_opts; ++ build_opts.emplace("-DWIDTH=" + support::cpp11::to_string(cols)); ++ ++ // Create kernel ++ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); ++ ++ ICLKernel::configure(win); ++} ++ ++Status CLReduceMaxKernel::validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output) ++{ ++ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ++ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, output)); ++ ++ return Status{}; ++} ++ ++void CLReduceMaxKernel::run(const Window &window, cl::CommandQueue &queue) ++{ ++ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ++ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); ++ ++ Window window_input = window; ++ Window slice_input = window_input.first_slice_window_1D(); ++ ++ do ++ { ++ Window slice_output = slice_input.shift_dimensions(1); ++ unsigned int idx = 0; ++ add_1D_tensor_argument(idx, _input, slice_input); ++ add_1D_tensor_argument(idx, _output, slice_output); ++ enqueue(queue, *this, slice_input); ++ ++ } ++ while(window_input.slide_window_slice_1D(slice_input)); ++} +diff --git a/src/core/CL/kernels/CLReductionMeanKernel.cpp b/src/core/CL/kernels/CLReductionMeanKernel.cpp +new file mode 100644 +index 0000000..8e4dc38 +--- /dev/null ++++ b/src/core/CL/kernels/CLReductionMeanKernel.cpp +@@ -0,0 +1,190 @@ ++/* ++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved ++ * Copyright (c) 2017-2018 ARM Limited. ++ * ++ * SPDX-License-Identifier: MIT ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in all ++ * copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. 
++ */ ++#include "arm_compute/core/CL/kernels/CLReductionMeanKernel.h" ++ ++#include "arm_compute/core/AccessWindowStatic.h" ++#include "arm_compute/core/CL/CLHelpers.h" ++#include "arm_compute/core/CL/CLKernelLibrary.h" ++#include "arm_compute/core/CL/ICLTensor.h" ++#include "arm_compute/core/FixedPoint.h" ++#include "arm_compute/core/Helpers.h" ++#include "arm_compute/core/TensorInfo.h" ++#include "arm_compute/core/Utils.h" ++#include "arm_compute/core/Validate.h" ++#include "arm_compute/core/Window.h" ++ ++#include "support/ToolchainSupport.h" ++ ++using namespace arm_compute; ++ ++namespace ++{ ++Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, std::vector<uint32_t> axis) ++{ ++ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); ++ ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW); ++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis.size() >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); ++ ++ std::vector<uint32_t>::const_iterator it; ++ bool axis_w = false; ++ bool axis_h = false; ++ for(it=axis.begin(); it!=axis.end(); ++it){ ++ if((*it) == 0 ) ++ { ++ axis_w = true; ++ } ++ else if((*it) == 1 ) ++ { ++ axis_h = true; ++ } ++ else{ ++ ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported axis!"); ++ } ++ } ++ //TODO Other axises (currently, only axises for both width and height are supported.) 
++ if( !axis_w || !axis_h) ++ { ++ ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported axis!"); ++ } ++ ++ if(output->total_size() != 0) ++ { ++ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ++ ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != DataLayout::NCHW); ++ } ++ ++ return Status{}; ++} ++ ++std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, std::vector<uint32_t> axis) ++{ ++ // Output tensor auto initialization if not yet initialized ++ TensorShape output_shape{ input->tensor_shape() }; ++ output_shape.set(0, 1); ++ output_shape.set(1, 1); ++ auto_init_if_empty(*output, output_shape, output->num_channels(), input->data_type(), input->fixed_point_position()); ++ ++ // Configure kernel window ++ constexpr unsigned int num_elems_processed_per_iteration_x = 8; //step ++ const unsigned int num_elems_processed_per_iteration_y = input->dimension(1); ++ ++ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); ++ AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); ++ AccessWindowHorizontal output_access(output, 0, 1); ++ bool window_changed = update_window_and_padding(win, input_access,output_access); ++ output_access.set_valid_region(win, output->valid_region()); ++ ++ Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; ++ ++ return std::make_tuple(err, win); ++} ++} // namespace ++ ++CLReductionMeanKernel::CLReductionMeanKernel() ++ : _input(nullptr), _output(nullptr), _reduction_axis(), _border_size() ++{ ++} ++ ++BorderSize CLReductionMeanKernel::border_size() const ++{ ++ return _border_size; ++} ++ ++void CLReductionMeanKernel::configure(const ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis) ++{ ++ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ++ ++ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis)); ++ ++ _input = input; ++ _output = output; ++ _reduction_axis = axis; ++ ++ constexpr unsigned int num_elems_processed_per_iteration_x = 8; //step ++ ++ // Set border size ++ _border_size = BorderSize(ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration_x) - input->info()->dimension(0)); ++ ++ // Set build options ++ std::set<std::string> build_opts; ++ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); ++ // build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); ++ if(is_data_type_fixed_point(input->info()->data_type())) ++ { ++ build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position())); ++ } ++ ++ // Create kernel ++ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reduction_mean", build_opts)); ++ ++ // Configure kernel window ++ auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis); ++ ++ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); ++ ++ ICLKernel::configure(std::get<1>(win_config)); ++} ++ ++Status CLReductionMeanKernel::validate(const ITensorInfo *input, const ITensorInfo *output, std::vector<uint32_t> axis) ++{ ++ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis)); ++ 
ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), axis))); ++ ++ return Status{}; ++} ++ ++void CLReductionMeanKernel::run(const Window &window, cl::CommandQueue &queue) ++{ ++ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ++ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); ++ ++ // Set out window ++ Window out_window(window); ++ out_window.set(Window::DimX, Window::Dimension(0, 0, 0)); ++ ++ // Get first input and output slices ++ Window in_slice = window.first_slice_window_2D(); ++ Window out_slice = out_window.first_slice_window_2D(); ++ ++ // Set local sums buffer ++ // TODO work_group ++ unsigned int local_sum_size = _lws_hint[0] * _input->info()->element_size(); ++ ++ unsigned int idx = 2 * num_arguments_per_2D_tensor(); ++ _kernel.setArg(idx++, local_sum_size, nullptr); ++ _kernel.setArg<cl_int>(idx++, static_cast<cl_int>(_input->info()->dimension(1)));//height ++ _kernel.setArg<cl_int>(idx++, static_cast<cl_int>(_input->info()->dimension(0)*_input->info()->dimension(1)));//divider ++ ++ do ++ { ++ unsigned int idx = 0; ++ add_2D_tensor_argument(idx, _input, in_slice); ++ in_slice.set_dimension_step(Window::DimY, _input->info()->dimension(1)); ++ add_2D_tensor_argument(idx, _output, out_slice); ++ enqueue(queue, *this, in_slice); ++ } ++ while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice)); ++} +diff --git a/src/core/CL/kernels/CLStridedSliceKernel.cpp b/src/core/CL/kernels/CLStridedSliceKernel.cpp +new file mode 100644 +index 0000000..b57cf20 +--- /dev/null ++++ b/src/core/CL/kernels/CLStridedSliceKernel.cpp +@@ -0,0 +1,316 @@ ++/* ++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved ++ * Copyright (c) 2017 ARM Limited. 
++ * ++ * SPDX-License-Identifier: MIT ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in all ++ * copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. 
++ */ ++#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h" ++ ++#include "arm_compute/core/AccessWindowStatic.h" ++#include "arm_compute/core/CL/CLHelpers.h" ++#include "arm_compute/core/CL/CLKernelLibrary.h" ++#include "arm_compute/core/CL/ICLTensor.h" ++#include "arm_compute/core/CL/OpenCL.h" ++#include "arm_compute/core/Helpers.h" ++#include "arm_compute/core/IAccessWindow.h" ++#include "arm_compute/core/TensorInfo.h" ++#include "arm_compute/core/Utils.h" ++#include "arm_compute/core/Validate.h" ++#include "arm_compute/core/Window.h" ++ ++#include <string> ++ ++ ++using namespace std; ++using namespace arm_compute; ++ ++static const int32_t maxDim = 4; ++ ++CLStridedSliceKernel::CLStridedSliceKernel() ++ : _input(nullptr), _output(nullptr), _beginData(nullptr), _endData(nullptr), _stridesData(nullptr), _beginMask(0), _endMask(0), _shrinkAxisMask(0) ++{ ++} ++ ++Status CLStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *begin, const ITensorInfo *end, const ITensorInfo *strides, int32_t beginMask, int32_t endMask, int32_t shrinkAxisMask) ++{ ++ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, begin, end, strides); ++ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8, ++ DataType::U16, DataType::S16, DataType::QS16, ++ DataType::U32, DataType::S32, DataType::F16, DataType::F32); ++ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(begin, 1, DataType::S32); ++ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(end, 1, DataType::S32); ++ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(strides, 1, DataType::S32); ++ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ++ ++ ARM_COMPUTE_ERROR_ON(begin->num_dimensions() != 1 || begin->dimension(0) > 4); ++ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(begin->tensor_shape(), end->tensor_shape(), strides->tensor_shape()); ++ ++ return Status{}; ++} ++ ++// Return the index for the first element along that 
axis. This index will be a ++// positive integer between [0, axisSize - 1] that can be used to index ++// directly into the data. ++inline int32_t StartForAxis(int32_t beginMask, int32_t begin, int32_t stride, const TensorShape &inputShape, int32_t axis) ++{ ++ // Begin with the specified index ++ int32_t start = begin; ++ ++ // beginMask override ++ if (beginMask & 1 << axis) ++ { ++ if (stride > 0) ++ { ++ // Forward iteration - use the first element. These values will get ++ // clamped below (Note: We could have set them to 0 and axisSize-1, but ++ // use lowest() and max() to maintain symmetry with StopForAxis()) ++ start = std::numeric_limits<int32_t>::lowest(); ++ } ++ else ++ { ++ // Backward iteration - use the last element. ++ start = std::numeric_limits<int32_t>::max(); ++ } ++ } ++ ++ // Handle negative indices ++ int32_t axisSize = inputShape[axis]; ++ if (start < 0) ++ { ++ start += axisSize; ++ } ++ ++ // Clamping ++ start = arm_compute::utility::clamp(start, 0, axisSize - 1); ++ ++ return start; ++} ++ ++// Return the "real" index for the end of iteration along that axis. This is an ++// "end" in the traditional C sense, in that it points to one past the last ++// element. ie. So if you were iterating through all elements of a 1D array of ++// size 4, this function would return 4 as the stop, because it is one past the ++// "real" indices of 0, 1, 2 & 3. ++inline int32_t StopForAxis(int32_t endMask, int32_t end, int32_t stride, const TensorShape &inputShape, int32_t axis) ++{ ++ // Begin with the specified index ++ int32_t stop = end; ++ ++ // endMask override ++ if (endMask & (1 << axis)) ++ { ++ if (stride > 0) ++ { ++ // Forward iteration - use the last element. These values will get ++ // clamped below ++ stop = std::numeric_limits<int32_t>::max(); ++ } ++ else ++ { ++ // Backward iteration - use the first element. 
++ stop = std::numeric_limits<int32_t>::lowest(); ++ } ++ } ++ ++ // Handle negative indices ++ int32_t axisSize = inputShape[axis]; ++ if (stop < 0) { ++ stop += axisSize; ++ } ++ ++ // Clamping ++ // Because the end index points one past the last element, we need slightly ++ // different clamping ranges depending on the direction. ++ if (stride > 0) ++ { ++ // Forward iteration ++ stop = arm_compute::utility::clamp(stop, 0, axisSize); ++ } ++ else ++ { ++ // Backward iteration ++ stop = arm_compute::utility::clamp(stop, -1, axisSize - 1); ++ } ++ ++ return stop; ++} ++ ++inline int32_t offset4D(const TensorShape &shape, int32_t b, int32_t d, int32_t h, int32_t w) ++{ ++ int32_t offset = b * shape[2] * shape[1] * shape[0]; ++ offset += d * shape[1] * shape[0]; ++ offset += h * shape[0]; ++ offset += w; ++ return offset; ++} ++ ++inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride) ++{ ++ int32_t ret = 0; ++ if (stride > 0) ++ { ++ ret = ((stop - start - 1) / stride) + 1; ++ } ++ else ++ { ++ ret = ((stop - start + 1) / stride) + 1; ++ } ++ ARM_COMPUTE_ERROR_ON_MSG(ret < 0, "The dimension must be the natural number"); ++ return ret; ++} ++ ++void CLStridedSliceKernel::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData, ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask, int32_t shrinkAxisMask) ++{ ++ ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), beginData->info(), endData->info(), stridesData->info(), beginMask, endMask, shrinkAxisMask)); ++ ++ _input = input; ++ _output = output; ++ _beginData = beginData; ++ _endData = endData; ++ _stridesData = stridesData; ++ _beginMask = beginMask; ++ _endMask = endMask; ++ _shrinkAxisMask = shrinkAxisMask; ++ ++ constexpr unsigned int num_elems_processed_per_iteration = 1; ++ ++ // Set kernel build options ++ std::set<std::string> build_opts; ++ build_opts.emplace("-DELEMENT_DATA_TYPE=" + 
get_cl_type_from_data_type(input->info()->data_type())); ++ build_opts.emplace("-DELEMENT_SIZE=" + support::cpp11::to_string(input->info()->element_size())); ++ ++ // Create kernel ++ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("strided_slice", build_opts)); ++ ++ // Create output's window without padding ++ TensorShape collapsed = output->info()->tensor_shape(); ++ collapsed.collapse(4); ++ TensorInfo info = *output->info(); ++ info.set_tensor_shape(collapsed); ++ Window win = calculate_max_window(info, Steps(num_elems_processed_per_iteration)); ++ ++ ICLKernel::configure(win); ++} ++ ++void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue) ++{ ++ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ++ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); ++ ++ // Create input window ++ TensorShape collapsed = _input->info()->tensor_shape(); ++ collapsed.collapse(4); ++ TensorInfo info = *_input->info(); ++ info.set_tensor_shape(collapsed); ++ Window win_in = calculate_max_window(info, Steps(_input->info()->tensor_shape().total_size())); ++ ++ _beginData->map(queue); ++ _endData->map(queue); ++ _stridesData->map(queue); ++ ++ std::vector<int32_t> dimsIn; ++ std::vector<int32_t> dimsOut; ++ std::vector<int32_t> starts; ++ std::vector<int32_t> stops; ++ std::vector<int32_t> strides; ++ ++ for (uint32_t n = 0; n < _beginData->info()->tensor_shape().total_size(); ++n) ++ { ++ const TensorShape shape = _input->info()->tensor_shape(); ++ starts.emplace_back(StartForAxis(_beginMask, reinterpret_cast<int32_t *>(_beginData->buffer())[n], ++ reinterpret_cast<int32_t *>(_stridesData->buffer())[n], ++ shape, n)); ++ ++ stops.emplace_back(StopForAxis(_endMask, reinterpret_cast<int32_t *>(_endData->buffer())[n], ++ reinterpret_cast<int32_t *>(_stridesData->buffer())[n], ++ shape, n)); ++ ++ strides.emplace_back(reinterpret_cast<int32_t *>(_stridesData->buffer())[n]); ++ dimsIn.emplace_back(shape[n]); ++ 
dimsOut.emplace_back(getOutDim(starts[n], stops[n], strides[n])); ++ } ++ ++ for (uint32_t n = _beginData->info()->tensor_shape().total_size(); n < 4; n++) { ++ starts.emplace_back(0); ++ stops.emplace_back(1); ++ strides.emplace_back(1); ++ dimsIn.emplace_back(1); ++ dimsOut.emplace_back(1); ++ } ++ // TODO: Apply shrinkAxisMask ++ ++ _beginData->unmap(queue); ++ _stridesData->unmap(queue); ++ _endData->unmap(queue); ++ ++ // Set parameters ++ unsigned int idx = 2 * num_arguments_per_1D_tensor(); // Skip the input and output parameters ++ const cl_int4 dimsInArg = ++ { ++ { ++ static_cast<cl_int>(dimsIn[0]), ++ static_cast<cl_int>(dimsIn[1]), ++ static_cast<cl_int>(dimsIn[2]), ++ static_cast<cl_int>(dimsIn[3]), ++ } ++ }; ++ _kernel.setArg<cl_int4>(idx++, dimsInArg); ++ ++ const cl_int4 dimsOutArg = ++ { ++ { ++ static_cast<cl_int>(dimsOut[0]), ++ static_cast<cl_int>(dimsOut[1]), ++ static_cast<cl_int>(dimsOut[2]), ++ static_cast<cl_int>(dimsOut[3]), ++ } ++ }; ++ _kernel.setArg<cl_int4>(idx++, dimsOutArg); ++ ++ const cl_int4 startsArg = ++ { ++ { ++ static_cast<cl_int>(starts[0]), ++ static_cast<cl_int>(starts[1]), ++ static_cast<cl_int>(starts[2]), ++ static_cast<cl_int>(starts[3]), ++ } ++ }; ++ _kernel.setArg<cl_int4>(idx++, startsArg); ++ ++ const cl_int4 stridesArg = ++ { ++ { ++ static_cast<cl_int>(strides[0]), ++ static_cast<cl_int>(strides[1]), ++ static_cast<cl_int>(strides[2]), ++ static_cast<cl_int>(strides[3]), ++ } ++ }; ++ _kernel.setArg<cl_int4>(idx++, stridesArg); ++ ++ // TODO: Apply slicing output's window ++ idx = 0; ++ add_1D_tensor_argument(idx, _input, win_in); ++ add_1D_tensor_argument(idx, _output, window); ++ ++ enqueue(queue, *this, window); ++} +diff --git a/src/core/CL/kernels/CLTopKV2Kernel.cpp b/src/core/CL/kernels/CLTopKV2Kernel.cpp +new file mode 100644 +index 0000000..08cc6bc +--- /dev/null ++++ b/src/core/CL/kernels/CLTopKV2Kernel.cpp +@@ -0,0 +1,479 @@ ++/* ++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved ++ * Copyright (c) 2017 ARM Limited. ++ * ++ * SPDX-License-Identifier: MIT ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in all ++ * copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. 
++ */ ++#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h" ++ ++#include "arm_compute/core/CL/CLHelpers.h" ++#include "arm_compute/core/CL/CLKernelLibrary.h" ++#include "arm_compute/core/CL/ICLTensor.h" ++#include "arm_compute/core/Helpers.h" ++#include "arm_compute/core/Validate.h" ++#include "arm_compute/core/Window.h" ++ ++#include <climits> ++#include <cassert> ++ ++namespace arm_compute ++{ ++//////////////////////////////////////////////////////////////////////////////// ++CLTopKV2Single::CLTopKV2Single() ++ : _input(nullptr), _topk_values(nullptr), _topk_indices(nullptr) ++{} ++ ++void CLTopKV2Single::configure(ICLTensor *input, ICLTensor *topk_values, ++ ICLTensor *topk_indices, cl::Buffer *indices, ++ cl::Buffer *temp_stack, int k, int n) ++{ ++ ARM_COMPUTE_ERROR_ON(input == nullptr && indices== nullptr); ++ ARM_COMPUTE_ERROR_ON(topk_values == nullptr && topk_indices == nullptr); ++ ARM_COMPUTE_ERROR_ON(n == 0); ++ ++ _input = input; ++ _topk_values = topk_values; ++ _topk_indices = topk_indices; ++ ++ // Set kernel build options ++ std::set<std::string> build_opts; ++ ++ // Create kernel ++ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("topkv2_quicksort", build_opts)); ++ ++ unsigned int idx = 3*num_arguments_per_1D_tensor(); ++ _kernel.setArg(idx++, *indices); ++ _kernel.setArg(idx++, *temp_stack); ++ _kernel.setArg<cl_int>(idx++, k); ++ _kernel.setArg<cl_int>(idx++, n); ++ ++ // Configure kernel window ++ Window win; ++ win.set(0, Window::Dimension(0, 1, 1)); ++ ICLKernel::configure(win); ++} ++ ++void CLTopKV2Single::run(const Window &window, cl::CommandQueue &queue) ++{ ++ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ++ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); ++ ++ unsigned int idx = 0; ++ add_1D_tensor_argument(idx, _input, window); ++ add_1D_tensor_argument(idx, _topk_values, window); ++ add_1D_tensor_argument(idx, _topk_indices, window); ++ ++ enqueue(queue, *this, window); ++} ++ 
++//////////////////////////////////////////////////////////////////////////////// ++CLTopKV2Init::CLTopKV2Init() ++ : _input(nullptr) ++{} ++ ++void CLTopKV2Init::configure(ICLTensor *input, cl::Buffer* in_key_buf, ++ cl::Buffer* in_ind_buf, int n) ++{ ++ ARM_COMPUTE_ERROR_ON(input == nullptr && in_key_buf == nullptr); ++ ARM_COMPUTE_ERROR_ON(in_ind_buf == nullptr); ++ ARM_COMPUTE_ERROR_ON(n == 0); ++ ++ _input = input; ++ ++ // Set kernel build options ++ std::set<std::string> build_opts; ++ ++ // Create kernel ++ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("topkv2_init", build_opts)); ++ ++ unsigned int idx = num_arguments_per_1D_tensor(); ++ _kernel.setArg(idx++, *in_key_buf); ++ _kernel.setArg(idx++, *in_ind_buf); ++ _kernel.setArg<cl_int>(idx++, n); ++ ++ // Configure kernel window ++ Window win; ++ win.set(0, Window::Dimension(0, n, 1)); ++ ICLKernel::configure(win); ++} ++ ++void CLTopKV2Init::run(const Window &window, cl::CommandQueue &queue) ++{ ++ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ++ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); ++ ++ unsigned int idx = 0; ++ add_1D_tensor_argument(idx, _input, window); ++ ++ enqueue(queue, *this, window); ++} ++ ++//////////////////////////////////////////////////////////////////////////////// ++// This kernel makes a histogram of radix for each work item. 
++CLRadixSortHistogram::CLRadixSortHistogram() ++: _pass(0), _in_key_buf(nullptr) ++{} ++ ++void CLRadixSortHistogram::configure(cl::Buffer* hist_buf, int bits, int n) ++{ ++ ARM_COMPUTE_ERROR_ON(hist_buf == nullptr); ++ ++ unsigned int radix = 1 << bits; ++ // Set kernel build options ++ std::set<std::string> build_opts; ++ build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); ++ build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); ++ build_opts.emplace("-DPERMUT=1"); ++ ++ // Create kernel ++ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("radixsort_histogram", build_opts)); ++ ++ int loc_histo_size = radix * _ITEMS * sizeof(cl_int); ++ ++ unsigned int idx = 1; ++ _kernel.setArg(idx++, *hist_buf); ++ ++ idx = 3; ++ _kernel.setArg(idx++, loc_histo_size, nullptr); ++ _kernel.setArg<cl_int>(idx++, n); ++ ++ // Configure kernel window ++ Window win; ++ win.set(0, Window::Dimension(0, _GROUPS*_ITEMS, 1)); ++ ICLKernel::configure(win); ++} ++ ++void CLRadixSortHistogram::run(const Window &window, cl::CommandQueue &queue) ++{ ++ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ++ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); ++ ++ _kernel.setArg(0, *_in_key_buf); ++ _kernel.setArg<cl_int>(2, _pass); ++ ++ cl::NDRange lws = cl::NDRange(_ITEMS, 1); ++ ++ enqueue(queue, *this, window, lws); ++} ++ ++//////////////////////////////////////////////////////////////////////////////// ++CLRadixSortScanHistogram::CLRadixSortScanHistogram() ++{} ++ ++void CLRadixSortScanHistogram::configure(cl::Buffer* hist_buf, cl::Buffer* glob_sum_buf, int bits) ++{ ++ ARM_COMPUTE_ERROR_ON(hist_buf == nullptr && glob_sum_buf == nullptr); ++ ++ unsigned int radix = 1 << bits; ++ // Set kernel build options ++ std::set<std::string> build_opts; ++ build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); ++ build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); ++ build_opts.emplace("-DPERMUT=1"); ++ ++ // 
Create kernel ++ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("radixsort_scanhistograms", build_opts)); ++ ++ int temp_size = std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint); ++ ++ unsigned int idx = 0; ++ _kernel.setArg(idx++, *hist_buf); ++ _kernel.setArg(idx++, temp_size, nullptr); ++ _kernel.setArg(idx++, *glob_sum_buf); ++ ++ // Configure kernel window ++ Window win; ++ win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS/2, 1)); ++ ICLKernel::configure(win); ++} ++ ++void CLRadixSortScanHistogram::run(const Window &window, cl::CommandQueue &queue) ++{ ++ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ++ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); ++ ++ const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); ++ cl::NDRange lws = cl::NDRange(gws_x/_HISTOSPLIT, 1); ++ ++ enqueue(queue, *this, window, lws); ++} ++ ++//////////////////////////////////////////////////////////////////////////////// ++CLRadixSortGlobalScanHistogram::CLRadixSortGlobalScanHistogram() ++{} ++ ++void CLRadixSortGlobalScanHistogram::configure(cl::Buffer* glob_sum_buf, cl::Buffer* temp_buf, int bits) ++{ ++ ARM_COMPUTE_ERROR_ON(glob_sum_buf == nullptr && temp_buf == nullptr); ++ ++ unsigned int radix = 1 << bits; ++ // Set kernel build options ++ std::set<std::string> build_opts; ++ build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); ++ build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); ++ build_opts.emplace("-DPERMUT=1"); ++ ++ // Create kernel ++ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("radixsort_scanhistograms", build_opts)); ++ ++ int temp_size = std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint); ++ ++ unsigned int idx = 0; ++ _kernel.setArg(idx++, *glob_sum_buf); ++ _kernel.setArg(idx++, temp_size, nullptr); ++ _kernel.setArg(idx++, *temp_buf); ++ ++ // 
Configure kernel window ++ Window win; ++ win.set(0, Window::Dimension(0, _HISTOSPLIT/2, 1)); ++ ICLKernel::configure(win); ++} ++ ++void CLRadixSortGlobalScanHistogram::run(const Window &window, cl::CommandQueue &queue) ++{ ++ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ++ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); ++ ++ const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); ++ cl::NDRange lws = cl::NDRange(gws_x, 1); ++ ++ enqueue(queue, *this, window, lws); ++} ++ ++//////////////////////////////////////////////////////////////////////////////// ++CLRadixSortPasteHistogram::CLRadixSortPasteHistogram() ++{} ++ ++void CLRadixSortPasteHistogram::configure(cl::Buffer* hist_buf, cl::Buffer* glob_sum_buf, int bits) ++{ ++ ARM_COMPUTE_ERROR_ON(hist_buf == nullptr && glob_sum_buf == nullptr); ++ ++ unsigned int radix = 1 << bits; ++ // Set kernel build options ++ std::set<std::string> build_opts; ++ build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); ++ build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); ++ build_opts.emplace("-DPERMUT=1"); ++ ++ // Create kernel ++ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("radixsort_pastehistograms", build_opts)); ++ ++ unsigned int idx = 0; ++ _kernel.setArg(idx++, *hist_buf); ++ _kernel.setArg(idx++, *glob_sum_buf); ++ ++ // Configure kernel window ++ Window win; ++ win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1)); ++ ICLKernel::configure(win); ++} ++ ++void CLRadixSortPasteHistogram::run(const Window &window, cl::CommandQueue &queue) ++{ ++ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ++ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); ++ ++ const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); ++ cl::NDRange lws = cl::NDRange(gws_x/_HISTOSPLIT, 1); ++ ++ enqueue(queue, *this, window, lws); ++} ++ 
++////////////////////////////////////////////////////////////////////////////////
++// CLRadixSortReorder
++//
++// Host-side wrapper around the "radixsort_reorder" OpenCL kernel. The key and
++// index buffer pointers start out null and must be assigned before run().
++CLRadixSortReorder::CLRadixSortReorder()
++    : _pass(0), _in_key_buf(nullptr), _out_key_buf(nullptr),
++      _in_ind_buf(nullptr), _out_ind_buf(nullptr)
++{
++}
++
++// Builds the "radixsort_reorder" kernel and binds the arguments that are known
++// at configuration time: the histogram buffer (arg 2), a local-memory argument
++// of radix * _ITEMS entries (arg 6) and the element count (arg 7).
++// Arguments 0, 1, 3, 4 and 5 are bound in run().
++//
++// hist_buf  Histogram buffer; must not be null.
++// bits      Exponent of the radix (radix == 1 << bits); must be in [1, 30].
++// n         Number of elements; must be positive.
++void CLRadixSortReorder::configure(cl::Buffer *hist_buf, int bits, int n)
++{
++    ARM_COMPUTE_ERROR_ON(hist_buf == nullptr);
++    // A negative element count is as invalid as an empty one.
++    ARM_COMPUTE_ERROR_ON(n <= 0);
++    // Shifting by a negative amount or past the width of the type is undefined.
++    ARM_COMPUTE_ERROR_ON(bits <= 0 || bits >= 31);
++
++    const unsigned int radix = 1U << bits;
++
++    // Set kernel build options
++    std::set<std::string> build_opts;
++    build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
++    build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
++    build_opts.emplace("-DPERMUT=1");
++
++    // Create kernel
++    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("radixsort_reorder", build_opts));
++
++    unsigned int idx = 2;
++    _kernel.setArg(idx++, *hist_buf);
++
++    idx = 6;
++    // cl_uint replaces the non-standard `uint` typedef for the element size.
++    _kernel.setArg(idx++, sizeof(cl_uint) * radix * _ITEMS, nullptr);
++    _kernel.setArg<cl_int>(idx++, n);
++
++    // Configure kernel window
++    Window win;
++    win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1));
++    ICLKernel::configure(win);
++}
++
++// Binds the key/index buffers and the pass number, then enqueues the kernel
++// with a local work size derived from _HISTOSPLIT.
++void CLRadixSortReorder::run(const Window &window, cl::CommandQueue &queue)
++{
++    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
++    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
++    // The buffers are dereferenced below and default to nullptr.
++    ARM_COMPUTE_ERROR_ON(_in_key_buf == nullptr || _out_key_buf == nullptr);
++    ARM_COMPUTE_ERROR_ON(_in_ind_buf == nullptr || _out_ind_buf == nullptr);
++
++    const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
++    const unsigned int lx    = std::max(1U, (gws_x / _HISTOSPLIT));
++    // Fall back to a local size of 1 when lx is not smaller than the global size.
++    cl::NDRange lws = (lx < gws_x) ? cl::NDRange(lx, 1) : cl::NDRange(1, 1);
++
++    _kernel.setArg(0, *_in_key_buf);
++    _kernel.setArg(1, *_out_key_buf);
++    _kernel.setArg<cl_int>(3, _pass);
++    _kernel.setArg(4, *_in_ind_buf);
++    _kernel.setArg(5, *_out_ind_buf);
++
++    enqueue(queue, *this, window, lws);
++}
++
++////////////////////////////////////////////////////////////////////////////////
++// CLTopKV2FindFirstNegative
++//
++// Host-side wrapper around the "topkv2_find_first_negative" OpenCL kernel.
++CLTopKV2FindFirstNegative::CLTopKV2FindFirstNegative()
++    : _out_key_buf(nullptr)
++{
++}
++
++// Builds the kernel and binds the result buffer (arg 1) and the element
++// count (arg 2). Arg 0 is bound in run(). The window covers [0, n).
++//
++// first_negative_idx_buf  Buffer bound as kernel argument 1; must not be null.
++// n                       Number of elements; must be positive.
++void CLTopKV2FindFirstNegative::configure(cl::Buffer *first_negative_idx_buf, int n)
++{
++    ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr);
++    ARM_COMPUTE_ERROR_ON(n <= 0);
++
++    // Set kernel build options
++    std::set<std::string> build_opts;
++
++    // Create kernel
++    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("topkv2_find_first_negative", build_opts));
++
++    unsigned int idx = 1;
++    _kernel.setArg(idx++, *first_negative_idx_buf);
++    _kernel.setArg<cl_int>(idx++, n);
++
++    // Configure kernel window
++    Window win;
++    win.set(0, Window::Dimension(0, n, 1));
++    ICLKernel::configure(win);
++}
++
++// Binds the key buffer as argument 0 and enqueues the kernel.
++void CLTopKV2FindFirstNegative::run(const Window &window, cl::CommandQueue &queue)
++{
++    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
++    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
++    // The buffer is dereferenced below and defaults to nullptr.
++    ARM_COMPUTE_ERROR_ON(_out_key_buf == nullptr);
++
++    unsigned int idx = 0;
++    _kernel.setArg(idx++, *_out_key_buf);
++
++    enqueue(queue, *this, window);
++}
++
++////////////////////////////////////////////////////////////////////////////////
++// CLTopKV2ReorderNegatives
++//
++// Host-side wrapper around the "topkv2_reorder_negatives" OpenCL kernel.
++CLTopKV2ReorderNegatives::CLTopKV2ReorderNegatives()
++    : _in_key_buf(nullptr), _out_key_buf(nullptr),
++      _in_ind_buf(nullptr), _out_ind_buf(nullptr)
++{
++}
++
++// Builds the kernel and binds the first-negative-index buffer (arg 4) and the
++// element count (arg 5). Args 0-3 are bound in run(). The window covers [0, n).
++//
++// first_negative_idx_buf  Buffer bound as kernel argument 4; must not be null.
++// n                       Number of elements; must be positive.
++void CLTopKV2ReorderNegatives::configure(cl::Buffer *first_negative_idx_buf, int n)
++{
++    ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr);
++    ARM_COMPUTE_ERROR_ON(n <= 0);
++
++    // Set kernel build options
++    std::set<std::string> build_opts;
++
++    // Create kernel
++    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("topkv2_reorder_negatives", build_opts));
++
++    unsigned int idx = 4;
++    _kernel.setArg(idx++, *first_negative_idx_buf);
++    _kernel.setArg<cl_int>(idx++, n);
++
++    // Configure kernel window
++    Window win;
++    win.set(0, Window::Dimension(0, n, 1));
++    ICLKernel::configure(win);
++}
++
++// Binds the input/output key and index buffers (args 0-3) and enqueues the kernel.
++void CLTopKV2ReorderNegatives::run(const Window &window, cl::CommandQueue &queue)
++{
++    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
++    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
++    // The buffers are dereferenced below and default to nullptr.
++    ARM_COMPUTE_ERROR_ON(_in_key_buf == nullptr || _out_key_buf == nullptr);
++    ARM_COMPUTE_ERROR_ON(_in_ind_buf == nullptr || _out_ind_buf == nullptr);
++
++    unsigned int idx = 0;
++    _kernel.setArg(idx++, *_in_key_buf);
++    _kernel.setArg(idx++, *_out_key_buf);
++    _kernel.setArg(idx++, *_in_ind_buf);
++    _kernel.setArg(idx++, *_out_ind_buf);
++
++    enqueue(queue, *this, window);
++}
++
++////////////////////////////////////////////////////////////////////////////////
++// CLTopKV2Store
++//
++// Host-side wrapper around the "topkv2_store" OpenCL kernel.
++CLTopKV2Store::CLTopKV2Store()
++    : _values(nullptr), _indices(nullptr), _out_key_buf(nullptr), _out_ind_buf(nullptr)
++{
++}
++
++// Builds the kernel, remembers the destination tensors and binds the element
++// count as the argument that follows the two 1D tensors and the two buffers
++// bound in run(). The window covers [0, k).
++//
++// values   Destination tensor for the values; must not be null.
++// indices  Destination tensor for the indices; must not be null.
++// k        Number of entries to store; must satisfy 0 < k <= n.
++// n        Number of elements.
++void CLTopKV2Store::configure(ICLTensor *values, ICLTensor *indices, int k, int n)
++{
++    // run() binds both tensors unconditionally, so either one being null is an error.
++    ARM_COMPUTE_ERROR_ON(values == nullptr || indices == nullptr);
++    ARM_COMPUTE_ERROR_ON(k <= 0);
++    ARM_COMPUTE_ERROR_ON(k > n);
++
++    _values  = values;
++    _indices = indices;
++
++    // Set kernel build options
++    std::set<std::string> build_opts;
++
++    // Create kernel
++    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("topkv2_store", build_opts));
++
++    unsigned int idx = 2 * num_arguments_per_1D_tensor() + 2;
++    _kernel.setArg<cl_int>(idx++, n);
++
++    // Configure kernel window
++    Window win;
++    win.set(0, Window::Dimension(0, k, 1));
++    ICLKernel::configure(win);
++}
++
++// Stores the buffers that run() binds after the two tensor arguments.
++void CLTopKV2Store::setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf)
++{
++    _out_key_buf = out_key_buf;
++    _out_ind_buf = out_ind_buf;
++}
++
++// Binds the two destination tensors followed by the key and index buffers,
++// then enqueues the kernel.
++void CLTopKV2Store::run(const Window &window, cl::CommandQueue &queue)
++{
++    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
++    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
++    // setOutputBuffers() must have been called with non-null buffers.
++    ARM_COMPUTE_ERROR_ON(_out_key_buf == nullptr || _out_ind_buf == nullptr);
++
++    unsigned int idx = 0;
++    add_1D_tensor_argument(idx, _values, window);
++    add_1D_tensor_argument(idx, _indices, window);
++    _kernel.setArg(idx++, *_out_key_buf);
++    _kernel.setArg(idx++, *_out_ind_buf);
++
++    enqueue(queue, *this, window);
++}
++
++} // namespace arm_compute
+diff --git a/src/core/Validate.cpp b/src/core/Validate.cpp
+index d4fabd4..d0374e7 100644
+--- a/src/core/Validate.cpp
++++ b/src/core/Validate.cpp
+@@ -169,7 +169,7 @@ arm_compute::Status arm_compute::error_on_invalid_subtensor(const char *function
+     // Subtensor should not index in x, y dimensions.
+     ARM_COMPUTE_RETURN_ERROR_ON_LOC(((coords.x() != 0) || (coords.y() != 0)), function, file, line);
+     // Subtensor shape should match parent tensor in x, y dimensions.
+-    ARM_COMPUTE_RETURN_ERROR_ON_LOC(((parent_shape.x() != shape.x()) || (parent_shape.y() != parent_shape.y())), function, file, line);
++    ARM_COMPUTE_RETURN_ERROR_ON_LOC(((parent_shape.x() != shape.x()) || (parent_shape.y() != shape.y())), function, file, line);
+ 
+     // Check dimensions
+     for(unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i)
+diff --git a/src/runtime/CL/functions/CLArithmeticSubtraction.cpp b/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
+index 5fca30c..3c13e3d 100644
+--- a/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
++++ b/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
+@@ -1,4 +1,5 @@
+ /*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+  * Copyright (c) 2016, 2017 ARM Limited.
+  *
+  * SPDX-License-Identifier: MIT
+@@ -23,6 +24,7 @@
+  */
+ #include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h"
+ 
++#include "arm_compute/core/CL/ICLTensor.h"
+ #include "arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h"
+ #include "support/ToolchainSupport.h"
+ 
+@@ -30,11 +32,21 @@
+ 
+ using namespace arm_compute;
+ 
+-void CLArithmeticSubtraction::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
++void CLArithmeticSubtraction::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
+ {
+     auto k = arm_compute::support::cpp14::make_unique<CLArithmeticSubtractionKernel>();
+     k->configure(input1, input2, output, policy);
+     _kernel = std::move(k);
++
++    // Replicate the border of the input whose x extent is 1 (presumably the broadcast operand - confirm).
++    if(output->info()->dimension(0) > 1)
++    {
++        ICLTensor *const candidate = (input1->info()->dimension(0) == 1) ? input1 : input2;
++        if(candidate->info()->dimension(0) == 1)
++        {
++            _border_handler.configure(candidate, _kernel->border_size(), BorderMode::REPLICATE);
++        }
++    }
+ }
+ 
+ Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+diff --git a/src/runtime/CL/functions/CLCast.cpp b/src/runtime/CL/functions/CLCast.cpp
+new file mode 100644
+index 0000000..4669577
+--- /dev/null
++++ b/src/runtime/CL/functions/CLCast.cpp
+@@ -0,0 +1,37 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2016-2018 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#include "arm_compute/runtime/CL/functions/CLCast.h"
++
++#include "arm_compute/core/CL/kernels/CLCastKernel.h"
++#include "support/ToolchainSupport.h"
++
++#include <utility>
++
++using namespace arm_compute;
++
++// Creates a CLCastKernel, configures it with the given tensors and installs it
++// as this function's kernel.
++//
++// input   Source tensor, forwarded to CLCastKernel::configure().
++// output  Destination tensor, forwarded to CLCastKernel::configure().
++void CLCast::configure(ICLTensor *input, ICLTensor *output)
++{
++    auto cast_kernel = arm_compute::support::cpp14::make_unique<CLCastKernel>();
++    cast_kernel->configure(input, output);
++    // std::move is declared in <utility>, which this file includes directly.
++    _kernel = std::move(cast_kernel);
++}
+diff --git a/src/runtime/CL/functions/CLGather.cpp b/src/runtime/CL/functions/CLGather.cpp
+new file mode 100644
+index 0000000..3f2f2c1
+--- /dev/null
++++ b/src/runtime/CL/functions/CLGather.cpp
+@@ -0,0 +1,45 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2016-2018 ARM Limited.
++ * ++ * SPDX-License-Identifier: MIT ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in all ++ * copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. 
++ */
++#include "arm_compute/runtime/CL/functions/CLGather.h"
++
++#include "arm_compute/core/CL/ICLTensor.h"
++#include "arm_compute/core/CL/kernels/CLGatherKernel.h"
++#include "support/ToolchainSupport.h"
++
++#include <utility>
++
++using namespace arm_compute;
++
++// Creates a CLGatherKernel, configures it with the three tensors and installs
++// it as this function's kernel. All arguments are forwarded unchanged to
++// CLGatherKernel::configure().
++void CLGather::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
++{
++    auto gather_kernel = arm_compute::support::cpp14::make_unique<CLGatherKernel>();
++    gather_kernel->configure(input1, input2, output);
++    _kernel = std::move(gather_kernel);
++}
++
++// Static validation: returns whatever CLGatherKernel::validate() reports for
++// the same tensor infos.
++Status CLGather::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
++{
++    const Status kernel_status = CLGatherKernel::validate(input1, input2, output);
++    return kernel_status;
++}
+diff --git a/src/runtime/CL/functions/CLPixelWiseDivision.cpp b/src/runtime/CL/functions/CLPixelWiseDivision.cpp
+new file mode 100644
+index 0000000..343e944
+--- /dev/null
++++ b/src/runtime/CL/functions/CLPixelWiseDivision.cpp
+@@ -0,0 +1,57 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2016-2018 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#include "arm_compute/runtime/CL/functions/CLPixelWiseDivision.h"
++
++#include "arm_compute/core/CL/ICLTensor.h"
++#include "arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h"
++#include "support/ToolchainSupport.h"
++
++#include <utility>
++
++using namespace arm_compute;
++
++// Creates and configures a CLPixelWiseDivisionKernel, installs it as this
++// function's kernel and, when one input has an x extent of 1 while the output
++// is wider, configures the border handler to replicate that input's border.
++//
++// All arguments are forwarded unchanged to CLPixelWiseDivisionKernel::configure().
++void CLPixelWiseDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale,
++                                    ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
++{
++    auto division_kernel = arm_compute::support::cpp14::make_unique<CLPixelWiseDivisionKernel>();
++    division_kernel->configure(input1, input2, output, scale, overflow_policy, rounding_policy);
++    _kernel = std::move(division_kernel);
++
++    // Nothing to replicate when the output itself is a single element wide.
++    if(output->info()->dimension(0) <= 1)
++    {
++        return;
++    }
++
++    // input1 is preferred when its x extent is 1; otherwise input2 is examined.
++    ICLTensor *const candidate = (input1->info()->dimension(0) == 1) ? input1 : input2;
++    if(candidate->info()->dimension(0) == 1)
++    {
++        _border_handler.configure(candidate, _kernel->border_size(), BorderMode::REPLICATE);
++    }
++}
++
++// Static validation: returns whatever CLPixelWiseDivisionKernel::validate()
++// reports for the same arguments.
++Status CLPixelWiseDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
++                                     ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
++{
++    const Status kernel_status = CLPixelWiseDivisionKernel::validate(input1, input2, output, scale, overflow_policy, rounding_policy);
++    return kernel_status;
++}
+diff --git a/src/runtime/CL/functions/CLReduceMax.cpp b/src/runtime/CL/functions/CLReduceMax.cpp
+new file mode 100644
+index 0000000..276ffd2
+--- /dev/null
++++ b/src/runtime/CL/functions/CLReduceMax.cpp
+@@ -0,0 +1,132 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2017 ARM Limited.
++ * ++ * SPDX-License-Identifier: MIT ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in all ++ * copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. 
++ */
++#include "arm_compute/runtime/CL/functions/CLReduceMax.h"
++
++#include "arm_compute/core/CL/ICLTensor.h"
++#include "support/ToolchainSupport.h"
++#include "arm_compute/core/CL/CLHelpers.h"
++#include "arm_compute/core/CL/kernels/CLReduceMaxKernel.h"
++
++#include <vector>
++#include <algorithm>
++
++#include <cstddef>
++#include <utility>
++
++// When non-zero, run() computes the reduction on the host (run_on_cpu())
++// instead of enqueueing the OpenCL kernel.
++#define REDUCE_MAX_RUN_ON_CPU 1
++
++namespace arm_compute
++{
++
++CLReduceMax::CLReduceMax()
++    : _axis(0), _input(nullptr), _output(nullptr), _kernel(nullptr)
++{
++}
++
++// Stores the arguments, configures a CLReduceMaxKernel with them and checks
++// the only layout run_on_cpu() can handle.
++//
++// input   Rank-2 source tensor; must not be null.
++// axis    Reduction axis; only 1 is accepted.
++// output  Rank-1 destination tensor; must not be null.
++void CLReduceMax::configure(ICLTensor *input, int axis, ICLTensor *output)
++{
++    // Both tensors are dereferenced below.
++    ARM_COMPUTE_ERROR_ON(input == nullptr || output == nullptr);
++
++    _axis = axis;
++
++    _input  = input;
++    _output = output;
++
++    auto k = arm_compute::support::cpp14::make_unique<CLReduceMaxKernel>();
++    k->configure(input, axis, output);
++    _kernel = std::move(k);
++
++    // We can handle for simple case only
++    // Output rank: 1
++    // Axis: one axis value, restrict to 1
++    ARM_COMPUTE_ERROR_ON(input->info()->tensor_shape().num_dimensions() != 2);
++    ARM_COMPUTE_ERROR_ON(output->info()->tensor_shape().num_dimensions() != 1);
++    ARM_COMPUTE_ERROR_ON(axis != 1);
++}
++
++// Static validation: returns whatever CLReduceMaxKernel::validate() reports.
++Status CLReduceMax::validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output)
++{
++    return CLReduceMaxKernel::validate(input, axis, output);
++}
++
++// Runs the reduction on the host or on the device, depending on
++// REDUCE_MAX_RUN_ON_CPU.
++void CLReduceMax::run()
++{
++#if REDUCE_MAX_RUN_ON_CPU
++    run_on_cpu();
++
++    arm_compute::CLScheduler::get().sync();
++#else
++    arm_compute::CLScheduler::get().enqueue(*_kernel);
++#endif
++}
++
++// Host implementation: maps both tensors, writes the maximum of every input
++// row to the output and unmaps the tensors again.
++void CLReduceMax::run_on_cpu()
++{
++    cl::CommandQueue q = CLScheduler::get().queue();
++
++    _input->map(q);
++    _output->map(q);
++
++    // Compute by CPU for simple case
++    // Input rank: 2
++    // Output rank: 1
++    // Axis: one axis value, restrict to 1
++    // NOTE(review): rows are indexed as if densely packed (row stride == cols
++    // floats) and the element type is taken to be float; confirm that the
++    // tensors carry no padding and are F32.
++    const float *input_data  = reinterpret_cast<const float *>(_input->buffer());
++    float       *output_data = reinterpret_cast<float *>(_output->buffer());
++
++    const size_t cols = _input->info()->tensor_shape()[0];
++    const size_t rows = _input->info()->tensor_shape()[1];
++
++    // An empty row has no maximum; skip the reduction instead of reading the
++    // (non-existent) first element of each row.
++    if(cols > 0)
++    {
++        for(size_t row = 0; row < rows; ++row)
++        {
++            const float *row_begin = input_data + row * cols;
++            // max_element keeps the first of several equal maxima and compares
++            // with operator<, exactly like the hand-written scan it replaces.
++            output_data[row] = *std::max_element(row_begin, row_begin + cols);
++        }
++    }
++
++    _input->unmap(q);
++    _output->unmap(q);
++}
++} // namespace arm_compute
+diff --git a/src/runtime/CL/functions/CLReductionMean.cpp b/src/runtime/CL/functions/CLReductionMean.cpp
+new file mode 100644
+index 0000000..4f71e84
+--- /dev/null
++++ b/src/runtime/CL/functions/CLReductionMean.cpp
+@@ -0,0 +1,60 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2017-2018 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#include "arm_compute/runtime/CL/functions/CLReductionMean.h"
++
++#include "arm_compute/core/CL/ICLTensor.h"
++#include "arm_compute/core/CL/kernels/CLReductionMeanKernel.h"
++#include "arm_compute/core/Error.h"
++#include "arm_compute/core/PixelValue.h"
++#include "arm_compute/core/TensorInfo.h"
++#include "arm_compute/core/Validate.h"
++#include "arm_compute/runtime/CL/CLScheduler.h"
++#include "arm_compute/runtime/Tensor.h"
++#include "support/ToolchainSupport.h"
++
++using namespace arm_compute;
++
++// Default-constructs the reduction kernel and the border-fill kernel.
++CLReductionMean::CLReductionMean()
++    : _reduction_mean_kernel(), _fill_border_kernel()
++{
++}
++
++// Static validation: propagates an error reported by
++// CLReductionMeanKernel::validate(), otherwise returns a default Status.
++Status CLReductionMean::validate(const ITensorInfo *input, const ITensorInfo *output, std::vector<uint32_t> axis)
++{
++    const Status kernel_status = CLReductionMeanKernel::validate(input, output, axis);
++    ARM_COMPUTE_RETURN_ON_ERROR(kernel_status);
++    return Status{};
++}
++
++// Configures the reduction kernel first, then sets up the border-fill kernel
++// to pad `input` with constant zeros using the border size the reduction
++// kernel reports.
++void CLReductionMean::configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis)
++{
++    _reduction_mean_kernel.configure(input, output, axis);
++
++    const PixelValue zero_border(0);
++    _fill_border_kernel.configure(input, _reduction_mean_kernel.border_size(), BorderMode::CONSTANT, zero_border);
++}
++
++// Enqueues the border fill followed by the reduction.
++void CLReductionMean::run()
++{
++    auto &scheduler = CLScheduler::get();
++    scheduler.enqueue(_fill_border_kernel);
++    scheduler.enqueue(_reduction_mean_kernel);
++}
+diff --git a/src/runtime/CL/functions/CLStridedSlice.cpp b/src/runtime/CL/functions/CLStridedSlice.cpp
+new file mode 100644
+index 0000000..2695fc6
+--- /dev/null
++++ b/src/runtime/CL/functions/CLStridedSlice.cpp
+@@ -0,0 +1,288 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2017 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#include "arm_compute/runtime/CL/functions/CLStridedSlice.h"
++
++#include "arm_compute/core/CL/ICLTensor.h"
++#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h"
++#include "arm_compute/core/utils/misc/Utility.h"
++#include "arm_compute/runtime/CL/CLScheduler.h"
++#include "support/ToolchainSupport.h"
++#include <limits>
++#include <utility>
++#include <vector>
++
++using namespace arm_compute;
++
++// Number of axes the host implementation iterates over; shorter index vectors
++// are padded up to this length in run_on_cpu().
++static const int32_t maxDims = 4;
++
++// Return the index for the first element along that axis. This index will be a
++// positive integer between [0, axisSize - 1] that can be used to index
++// directly into the data.
++inline int32_t StartForAxis(int32_t beginMask,
++                            std::vector<int32_t> const &startIndices,
++                            std::vector<int32_t> const &strides,
++                            const TensorShape &inputShape, int32_t axis)
++{
++    // Begin with the specified index
++    int32_t start = startIndices[axis];
++
++    // beginMask override
++    if(beginMask & (1 << axis))
++    {
++        if(strides[axis] > 0)
++        {
++            // Forward iteration - use the first element. These values will get
++            // clamped below (Note: We could have set them to 0 and axisSize-1, but
++            // use lowest() and max() to maintain symmetry with StopForAxis())
++            start = std::numeric_limits<int32_t>::lowest();
++        }
++        else
++        {
++            // Backward iteration - use the last element.
++            start = std::numeric_limits<int32_t>::max();
++        }
++    }
++
++    // Handle negative indices
++    const int32_t axisSize = inputShape[axis];
++    if(start < 0)
++    {
++        start += axisSize;
++    }
++
++    // Clamping
++    return arm_compute::utility::clamp(start, 0, axisSize - 1);
++}
++
++// Return the "real" index for the end of iteration along that axis. This is an
++// "end" in the traditional C sense, in that it points to one past the last
++// element. ie. So if you were iterating through all elements of a 1D array of
++// size 4, this function would return 4 as the stop, because it is one past the
++// "real" indices of 0, 1, 2 & 3.
++inline int32_t StopForAxis(int32_t endMask, std::vector<int32_t> const &stopIndices,
++                           std::vector<int32_t> const &strides,
++                           const TensorShape &inputShape, int32_t axis)
++{
++    // Begin with the specified index
++    int32_t stop = stopIndices[axis];
++
++    // endMask override
++    if(endMask & (1 << axis))
++    {
++        if(strides[axis] > 0)
++        {
++            // Forward iteration - use the last element. These values will get
++            // clamped below
++            stop = std::numeric_limits<int32_t>::max();
++        }
++        else
++        {
++            // Backward iteration - use the first element.
++            stop = std::numeric_limits<int32_t>::lowest();
++        }
++    }
++
++    // Handle negative indices
++    const int32_t axisSize = inputShape[axis];
++    if(stop < 0)
++    {
++        stop += axisSize;
++    }
++
++    // Clamping
++    // Because the end index points one past the last element, we need slightly
++    // different clamping ranges depending on the direction.
++    if(strides[axis] > 0)
++    {
++        // Forward iteration
++        return arm_compute::utility::clamp(stop, 0, axisSize);
++    }
++
++    // Backward iteration
++    return arm_compute::utility::clamp(stop, -1, axisSize - 1);
++}
++
++// Linear element offset of (b, d, h, w) in a tensor of the given shape, where
++// w indexes dimension 0, h dimension 1, d dimension 2 and b dimension 3.
++inline int32_t offset4D(const TensorShape &shape, int32_t b, int32_t d, int32_t h, int32_t w)
++{
++    int32_t offset = b * shape[2] * shape[1] * shape[0];
++    offset += d * shape[1] * shape[0];
++    offset += h * shape[0];
++    offset += w;
++    return offset;
++}
++
++// Creates a CLStridedSliceKernel, configures it with the given arguments and
++// installs it as this function's kernel.
++void CLStridedSlice::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData, ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask, int32_t shrinkAxisMask)
++{
++    auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceKernel>();
++    k->configure(input, output, beginData, endData, stridesData, beginMask, endMask, shrinkAxisMask);
++    _kernel = std::move(k);
++}
++
++// Validates the arguments through CLStridedSliceKernel::validate() and stores
++// them for run_on_cpu(). No kernel is created.
++void CLStridedSliceCPU::configure(ICLTensor *input, ICLTensor *output, ICLTensor *beginData, ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask, int32_t shrinkAxisMask)
++{
++    // Every tensor is dereferenced right below.
++    ARM_COMPUTE_ERROR_ON(input == nullptr || output == nullptr);
++    ARM_COMPUTE_ERROR_ON(beginData == nullptr || endData == nullptr || stridesData == nullptr);
++    ARM_COMPUTE_ERROR_THROW_ON(CLStridedSliceKernel::validate(input->info(), output->info(), beginData->info(), endData->info(), stridesData->info(), beginMask, endMask, shrinkAxisMask));
++
++    _input          = input;
++    _output         = output;
++    _beginData      = beginData;
++    _endData        = endData;
++    _stridesData    = stridesData;
++    _beginMask      = beginMask;
++    _endMask        = endMask;
++    _shrinkAxisMask = shrinkAxisMask;
++}
++
++// Performs the slice on the host, then synchronises the CL scheduler.
++void CLStridedSliceCPU::run()
++{
++    run_on_cpu();
++
++    arm_compute::CLScheduler::get().sync();
++}
++
++// Number of elements visited when stepping from `start` towards `stop`
++// (exclusive) by `stride`. Returns 0 when the range is empty for the given
++// direction; `stride` must not be zero.
++inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride)
++{
++    ARM_COMPUTE_ERROR_ON(stride == 0);
++
++    if(stride > 0)
++    {
++        // Without the guard an empty forward range yields 1 (stride > 1) or a
++        // negative count because integer division truncates towards zero.
++        return (stop > start) ? ((stop - start - 1) / stride) + 1 : 0;
++    }
++
++    return (stop < start) ? ((stop - start + 1) / stride) + 1 : 0;
++}
++
++// Copies the strided slice described by the start/stop/stride vectors from
++// inputData to outputData. All three vectors must hold exactly maxDims
++// entries, indexed by dimension (0 = innermost), and no stride may be zero.
++// outputData is written densely, in the order the elements are visited.
++template <typename T>
++inline void StridedSlice(const T *inputData, const TensorShape &inputShape,
++                         int32_t beginMask, int32_t endMask,
++                         const std::vector<int32_t> &startIndices,
++                         const std::vector<int32_t> &stopIndices,
++                         const std::vector<int32_t> &strides, T *outputData)
++{
++    ARM_COMPUTE_ERROR_ON(startIndices.size() != maxDims);
++    ARM_COMPUTE_ERROR_ON(stopIndices.size() != maxDims);
++    ARM_COMPUTE_ERROR_ON(strides.size() != maxDims);
++    // A zero stride would never advance the loops below.
++    for(int32_t axis = 0; axis < maxDims; ++axis)
++    {
++        ARM_COMPUTE_ERROR_ON(strides[axis] == 0);
++    }
++
++    const int32_t start_b = StartForAxis(beginMask, startIndices, strides, inputShape, 3);
++    const int32_t stop_b  = StopForAxis(endMask, stopIndices, strides, inputShape, 3);
++    const int32_t start_d = StartForAxis(beginMask, startIndices, strides, inputShape, 2);
++    const int32_t stop_d  = StopForAxis(endMask, stopIndices, strides, inputShape, 2);
++    const int32_t start_h = StartForAxis(beginMask, startIndices, strides, inputShape, 1);
++    const int32_t stop_h  = StopForAxis(endMask, stopIndices, strides, inputShape, 1);
++    const int32_t start_w = StartForAxis(beginMask, startIndices, strides, inputShape, 0);
++    const int32_t stop_w  = StopForAxis(endMask, stopIndices, strides, inputShape, 0);
++
++    // The shape of outputData may collapse in one-dimension.
++    // Therefore, it is necessary to create a shape that matches the result of the outputData.
++    TensorShape outputShape(getOutDim(start_w, stop_w, strides[0]), getOutDim(start_h, stop_h, strides[1]),
++                            getOutDim(start_d, stop_d, strides[2]), getOutDim(start_b, stop_b, strides[3]));
++    for(int32_t in_b = start_b, b = 0; strides[3] > 0 ? in_b < stop_b : in_b > stop_b; in_b += strides[3], b++)
++    {
++        for(int32_t in_d = start_d, d = 0; strides[2] > 0 ? in_d < stop_d : in_d > stop_d; in_d += strides[2], d++)
++        {
++            for(int32_t in_h = start_h, h = 0; strides[1] > 0 ? in_h < stop_h : in_h > stop_h; in_h += strides[1], h++)
++            {
++                for(int32_t in_w = start_w, w = 0; strides[0] > 0 ? in_w < stop_w : in_w > stop_w; in_w += strides[0], w++)
++                {
++                    outputData[offset4D(outputShape, b, d, h, w)] = inputData[offset4D(inputShape, in_b, in_d, in_h, in_w)];
++                }
++            }
++        }
++    }
++}
++
++// Host implementation: maps all five tensors, reads one begin/end/stride value
++// per input dimension, pads the vectors to maxDims entries and dispatches to
++// StridedSlice<T> according to the input data type.
++void CLStridedSliceCPU::run_on_cpu()
++{
++    // TODO: Support shrinkAxisMask
++    cl::CommandQueue q = CLScheduler::get().queue();
++
++    _input->map(q);
++    _output->map(q);
++    _beginData->map(q);
++    _endData->map(q);
++    _stridesData->map(q);
++
++    TensorShape inputShape = _input->info()->tensor_shape();
++
++    // Comparing against the count directly (instead of `idx <= count - 1`)
++    // keeps the loop from wrapping around when the tensor reports rank 0.
++    const uint32_t num_dims = static_cast<uint32_t>(_input->info()->num_dimensions());
++    ARM_COMPUTE_ERROR_ON(num_dims > static_cast<uint32_t>(maxDims));
++
++    // NOTE(review): the three index tensors are read as int32_t and assumed to
++    // hold at least num_dims elements each - confirm against the callers.
++    const int32_t *begin_values  = reinterpret_cast<int32_t *>(_beginData->buffer());
++    const int32_t *end_values    = reinterpret_cast<int32_t *>(_endData->buffer());
++    const int32_t *stride_values = reinterpret_cast<int32_t *>(_stridesData->buffer());
++
++    std::vector<int32_t> starts;
++    std::vector<int32_t> stops;
++    std::vector<int32_t> strides;
++    starts.reserve(maxDims);
++    stops.reserve(maxDims);
++    strides.reserve(maxDims);
++
++    for(uint32_t idx = 0; idx < num_dims; ++idx)
++    {
++        starts.emplace_back(begin_values[idx]);
++        stops.emplace_back(end_values[idx]);
++        strides.emplace_back(stride_values[idx]);
++    }
++
++    // Missing dimensions select their single element: [0, 1) with stride 1.
++    for(uint32_t i = num_dims; i < static_cast<uint32_t>(maxDims); i++)
++    {
++        starts.emplace_back(0);
++        stops.emplace_back(1);
++        strides.emplace_back(1);
++    }
++
++    switch(_input->info()->data_type())
++    {
++        case DataType::U8:
++        case DataType::QASYMM8:
++            StridedSlice(reinterpret_cast<const uint8_t *>(_input->buffer()), inputShape, _beginMask, _endMask, starts, stops, strides, reinterpret_cast<uint8_t *>(_output->buffer()));
++            break;
++        case DataType::S8:
++        case DataType::QS8:
++            StridedSlice(reinterpret_cast<const int8_t *>(_input->buffer()), inputShape, _beginMask, _endMask, starts, stops, strides, reinterpret_cast<int8_t *>(_output->buffer()));
++            break;
++        case DataType::U16:
++            StridedSlice(reinterpret_cast<const uint16_t *>(_input->buffer()), inputShape, _beginMask, _endMask, starts, stops, strides, reinterpret_cast<uint16_t *>(_output->buffer()));
++            break;
++        case DataType::S16:
++        case DataType::QS16:
++            StridedSlice(reinterpret_cast<const int16_t *>(_input->buffer()), inputShape, _beginMask, _endMask, starts, stops, strides, reinterpret_cast<int16_t *>(_output->buffer()));
++            break;
++        case DataType::F16:
++            // Not sure this works.
++            StridedSlice(reinterpret_cast<const half *>(_input->buffer()), inputShape, _beginMask, _endMask, starts, stops, strides, reinterpret_cast<half *>(_output->buffer()));
++            break;
++        case DataType::U32:
++            StridedSlice(reinterpret_cast<const uint32_t *>(_input->buffer()), inputShape, _beginMask, _endMask, starts, stops, strides, reinterpret_cast<uint32_t *>(_output->buffer()));
++            break;
++        case DataType::S32:
++            StridedSlice(reinterpret_cast<const int32_t *>(_input->buffer()), inputShape, _beginMask, _endMask, starts, stops, strides, reinterpret_cast<int32_t *>(_output->buffer()));
++            break;
++        case DataType::F32:
++            StridedSlice(reinterpret_cast<const float *>(_input->buffer()), inputShape, _beginMask, _endMask, starts, stops, strides, reinterpret_cast<float *>(_output->buffer()));
++            break;
++        default:
++            ARM_COMPUTE_ERROR("DataType not supported");
++            break;
++    }
++
++    _input->unmap(q);
++    _output->unmap(q);
++    _beginData->unmap(q);
++    _endData->unmap(q);
++    _stridesData->unmap(q);
++}
+diff --git a/src/runtime/CL/functions/CLTopKV2.cpp b/src/runtime/CL/functions/CLTopKV2.cpp
+new file mode 100644
+index 0000000..ed9797e
+--- /dev/null
++++ b/src/runtime/CL/functions/CLTopKV2.cpp
+@@ -0,0 +1,310 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2017 ARM Limited.
++ * ++ * SPDX-License-Identifier: MIT ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in all ++ * copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. 
++ */
++#include "arm_compute/runtime/CL/functions/CLTopKV2.h"
++
++#include "arm_compute/core/CL/ICLTensor.h"
++#include "arm_compute/core/CL/CLHelpers.h"
++
++#include <vector>
++#include <algorithm>
++#include <cstddef>
++#include <cstdlib>
++#include <string>
++
++#include "../../topk_v2.h"
++
++namespace arm_compute
++{
++
++namespace
++{
++// Returns the value of the ACL_TOPKV2 environment variable, or an empty string
++// when it is not set. "GPU_SINGLE" and "GPU" select the two device
++// implementations; any other value selects the CPU path.
++std::string topkv2_backend()
++{
++    const char *env = std::getenv("ACL_TOPKV2");
++    return (env != nullptr) ? std::string(env) : std::string();
++}
++} // namespace
++
++CLTopKV2::CLTopKV2()
++    : _k(0), _total_bits(0), _bits(0), _radix(0), _hist_buf_size(0),
++      _glob_sum_buf_size(0), _n(0), _input(nullptr),
++      _values(nullptr), _indices(nullptr), _qs_idx_buf(), _qs_temp_buf(),
++      _hist_buf(), _glob_sum_buf(), _temp_buf(),
++      _first_negative_idx_buf(), _in_key_buf(), _out_key_buf(), _in_ind_buf(), _out_ind_buf(),
++      _p_in_key_buf(nullptr), _p_out_key_buf(nullptr), _p_in_ind_buf(nullptr), _p_out_ind_buf(nullptr),
++      _qs_kernel(),
++      _init_kernel(), _hist_kernel(), _scan_hist_kernel(), _glob_scan_hist_kernel(), _paste_hist_kernel(),
++      _reorder_kernel(), _find_first_negative_kernel(), _reorder_negatives_kernel(), _store_kernel()
++{
++}
++
++// Stores the arguments and, depending on ACL_TOPKV2, allocates the device
++// buffers and configures the kernels of the selected GPU implementation.
++// Nothing is allocated or configured for the CPU path.
++//
++// input       Source tensor; its dimension 0 is taken as the element count.
++// k           Number of top entries requested.
++// values      Destination tensor for the values.
++// indices     Destination tensor for the indices.
++// total_bits  Must be a multiple of bits.
++// bits        Exponent of the radix (radix == 1 << bits); must be in [1, 30].
++//
++// NOTE(review): run() reads ACL_TOPKV2 again, so a value that changes between
++// configure() and run() would select kernels that were never configured.
++void CLTopKV2::configure(ICLTensor *input, int k, ICLTensor *values, ICLTensor *indices,
++                         int total_bits, int bits)
++{
++    // input is dereferenced below; bits is used as a divisor and a shift count.
++    ARM_COMPUTE_ERROR_ON(input == nullptr);
++    ARM_COMPUTE_ERROR_ON(bits <= 0 || bits >= 31);
++
++    _total_bits = total_bits;
++    _bits       = bits;
++    _n          = input->info()->tensor_shape()[0];
++
++    // _total_bits should be divided by _bits.
++    ARM_COMPUTE_ERROR_ON((_total_bits % _bits) != 0);
++
++    _k     = k;
++    _radix = 1 << bits;
++
++    _input   = input;
++    _values  = values;
++    _indices = indices;
++
++    // Every device buffer uses the same context and allocation flags.
++    const auto make_buffer = [](size_t bytes) {
++        return cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, bytes);
++    };
++
++    const std::string topk_env = topkv2_backend();
++
++    if(topk_env == "GPU_SINGLE")
++    {
++        _qs_idx_buf  = make_buffer(sizeof(cl_int) * _n);
++        _qs_temp_buf = make_buffer(sizeof(cl_int) * _n);
++
++        _qs_kernel.configure(input, values, indices, &_qs_idx_buf, &_qs_temp_buf, k, _n);
++    }
++    else if(topk_env == "GPU")
++    {
++        // n should be divided by (_GROUPS * _ITEMS)
++        ARM_COMPUTE_ERROR_ON((_n % (_GROUPS * _ITEMS)) != 0);
++
++        _hist_buf_size     = _radix * _GROUPS * _ITEMS;
++        _glob_sum_buf_size = _HISTOSPLIT;
++
++        _hist_buf               = make_buffer(sizeof(cl_int) * _hist_buf_size);
++        _glob_sum_buf           = make_buffer(sizeof(cl_int) * _glob_sum_buf_size);
++        _temp_buf               = make_buffer(sizeof(cl_int) * _glob_sum_buf_size);
++        _first_negative_idx_buf = make_buffer(sizeof(cl_int));
++        _in_key_buf             = make_buffer(sizeof(cl_float) * _n);
++        _out_key_buf            = make_buffer(sizeof(cl_float) * _n);
++        _in_ind_buf             = make_buffer(sizeof(cl_int) * _n);
++        _out_ind_buf            = make_buffer(sizeof(cl_int) * _n);
++
++        _p_in_key_buf  = &_in_key_buf;
++        _p_out_key_buf = &_out_key_buf;
++        _p_in_ind_buf  = &_in_ind_buf;
++        _p_out_ind_buf = &_out_ind_buf;
++
++        _init_kernel.configure(input, _p_in_key_buf, _p_in_ind_buf, _n);
++        _hist_kernel.configure(&_hist_buf, bits, _n);
++        _scan_hist_kernel.configure(&_hist_buf, &_glob_sum_buf, bits);
++        _glob_scan_hist_kernel.configure(&_glob_sum_buf, &_temp_buf, bits);
++        _paste_hist_kernel.configure(&_hist_buf, &_glob_sum_buf, bits);
++        _reorder_kernel.configure(&_hist_buf, bits, _n);
++        _find_first_negative_kernel.configure(&_first_negative_idx_buf, _n);
++        _reorder_negatives_kernel.configure(&_first_negative_idx_buf, _n);
++        _store_kernel.configure(values, indices, k, _n);
++    }
++    else
++    {
++        // DO NOTHING for CPU.
++    }
++}
++
++// Dispatches to the implementation selected by ACL_TOPKV2 at call time.
++void CLTopKV2::run()
++{
++    const std::string topk_env = topkv2_backend();
++
++    if(topk_env == "GPU_SINGLE")
++    {
++        run_on_gpu_single_quicksort();
++    }
++    else if(topk_env == "GPU")
++    {
++        run_on_gpu();
++    }
++    else
++    {
++        run_on_cpu();
++    }
++}
++
++// Enqueues the quick-sort kernel and waits for the scheduler to finish.
++void CLTopKV2::run_on_gpu_single_quicksort()
++{
++    // This is a single threaded quick sort implementation.
++    CLScheduler::get().enqueue(_qs_kernel, false);
++
++    arm_compute::CLScheduler::get().sync();
++}
++
++void CLTopKV2::run_on_gpu()
++{
++    cl::CommandQueue q = CLScheduler::get().queue();
++
++    //1. CLTopKV2Init set key buffer and index buffer.
++    // - Key buffer is set as the same value of the layer's input
++    // - Values in the index buffer are set as their indices.
++    CLScheduler::get().enqueue(_init_kernel, false);
++
++    int n_passes = _total_bits / _bits;
++
++    // 2. Repeat (total_bits/bits) times.
++    // - total_bits is the number of bits of the data type (e.g., 32 for float)
++    // - bits defines number of buckets (e.g. 16 buckets where bit is 4)
++    for(int pass = 0; pass < n_passes; ++pass) {
++        arm_compute::CLScheduler::get().sync();
++
++        // 2.1.
Calculate histogram with _GROUPS * _ITEMS threads ++ _hist_kernel.setPass(pass, _p_in_key_buf); ++ CLScheduler::get().enqueue(_hist_kernel, false); ++ ++ // 2.2. Calculate prefix sum locally with multiple threads ++ CLScheduler::get().enqueue(_scan_hist_kernel, false); ++ // 2.3. Calculate prefix sum within a work group ++ CLScheduler::get().enqueue(_glob_scan_hist_kernel, false); ++ // 2.4. Calculate global prefix sum ++ CLScheduler::get().enqueue(_paste_hist_kernel, false); ++ ++ // 2.5. Reorder keys and indices based on the global prefix sum ++ _reorder_kernel.setPass(pass, _p_in_key_buf, _p_out_key_buf, ++ _p_in_ind_buf, _p_out_ind_buf); ++ CLScheduler::get().enqueue(_reorder_kernel, false); ++ ++ cl::Buffer *tmp; ++ // swap key buffers ++ tmp = _p_in_key_buf; ++ _p_in_key_buf = _p_out_key_buf; ++ _p_out_key_buf = tmp; ++ ++ // swap index buffers ++ tmp = _p_in_ind_buf; ++ _p_in_ind_buf = _p_out_ind_buf; ++ _p_out_ind_buf = tmp; ++ } ++ ++ // 3. Get the first negative index ++ // Because we swap in_buf and out_buf at the end of the above for loop, ++ // the output buffers are in bufs. ++ _find_first_negative_kernel.setOutputBuffer(_p_in_key_buf); ++ CLScheduler::get().enqueue(_find_first_negative_kernel, false); ++ ++ // 4. Correct odering of negatives ++ // - Since radix sort does not consider negatives, negatives are considered as bigger values than positives. ++ // reordered data will be stored in _p_out_key_buf and _p_out_ind_buf ++ _reorder_negatives_kernel.setBuffers(_p_in_key_buf, _p_out_key_buf, ++ _p_in_ind_buf, _p_out_ind_buf); ++ CLScheduler::get().enqueue(_reorder_negatives_kernel, false); ++ ++ // 5. Extract top k values from sorted keys and indices. ++ _store_kernel.setOutputBuffers(_p_out_key_buf, _p_out_ind_buf); ++ CLScheduler::get().enqueue(_store_kernel, false); ++ ++ arm_compute::CLScheduler::get().sync(); ++ ++#if 0 ++ // below code is left for debugging. 
++ int first_neg; ++ q.enqueueReadBuffer(_first_negative_idx_buf, CL_TRUE, 0, sizeof(cl_int), &first_neg); ++ std::cout << "first neg = " << first_neg << std::endl; ++ ++ float in_key[_n]; ++ q.enqueueReadBuffer(*_p_in_key_buf, CL_TRUE, 0, sizeof(cl_float)*_n, in_key); ++ for(uint32_t i = 0 ; i < _n; ++i) { ++ std::cout << "in_key[" << i << "] = " << in_key[i] << std::endl; ++ } ++ ++ float out_key[_n]; ++ q.enqueueReadBuffer(*_p_out_key_buf, CL_TRUE, 0, sizeof(cl_float)*_n, out_key); ++ for(uint32_t i = 0 ; i < _n; ++i) { ++ std::cout << "out_key[" << i << "] = " << out_key[i] << std::endl; ++ } ++ ++ int in_ind[_n]; ++ q.enqueueReadBuffer(*_p_in_ind_buf, CL_TRUE, 0, sizeof(cl_int)*_n, in_ind); ++ for(uint32_t i = 0 ; i < _n; ++i) { ++ std::cout << "in_ind[" << i << "] = " << in_ind[i] << std::endl; ++ } ++ ++ int out_ind[_n]; ++ q.enqueueReadBuffer(*_p_out_ind_buf, CL_TRUE, 0, sizeof(cl_int)*_n, out_ind); ++ for(uint32_t i = 0 ; i < _n; ++i) { ++ std::cout << "out_ind[" << i << "] = " << out_ind[i] << std::endl; ++ } ++ ++ int hist_buf[_hist_buf_size]; ++ q.enqueueReadBuffer(_hist_buf, CL_TRUE, 0, sizeof(cl_int)*_hist_buf_size, hist_buf); ++ for(uint32_t i = 0 ; i < _hist_buf_size; ++i) { ++ std::cout << "hist_buf[" << i << "] = " << hist_buf[i] << std::endl; ++ } ++ ++ int glob_sum_buf[_glob_sum_buf_size]; ++ q.enqueueReadBuffer(_glob_sum_buf, CL_TRUE, 0, sizeof(cl_int)*_glob_sum_buf_size, glob_sum_buf); ++ for(uint32_t i = 0 ; i < _glob_sum_buf_size; ++i) { ++ std::cout << "glob_sum_buf[" << i << "] = " << glob_sum_buf[i] << std::endl; ++ } ++ ++#endif ++} ++ ++void CLTopKV2::run_on_cpu() ++{ ++ cl::CommandQueue q = CLScheduler::get().queue(); ++ //const Window& w = _topkv2_kernel.window(); ++ ++ _input->map(q); ++ _values->map(q); ++ _indices->map(q); ++ ++ //int row_size = (w[0].end() - w[0].start()) / w[0].step(); ++ int row_size = _input->info()->tensor_shape()[0]; ++ int rank = _input->info()->num_dimensions(); ++ ++ if (rank > 2) ++ throw 
std::runtime_error("Not supported type."); ++ ++ int row_num = (rank == 2 ? _input->info()->tensor_shape()[1] : 1); ++ ++ if (_input->info()->data_type() == DataType::F32) ++ { ++ nnfw::rt::optimized_ops::TopK<float>(row_size, row_num, (float*)_input->buffer(), _k, ++ (int32*)_indices->buffer(), (float*)_values->buffer()); ++ } ++ else if (_input->info()->data_type() == DataType::S32) ++ { ++ nnfw::rt::optimized_ops::TopK<int32_t>(row_size, row_num, (int32_t*)_input->buffer(), _k, ++ (int32*)_indices->buffer(), (int32_t*)_values->buffer()); ++ } ++ else if (_input->info()->data_type() == DataType::QASYMM8) ++ { ++ nnfw::rt::optimized_ops::TopK<uint8_t>(row_size, row_num, (uint8_t*)_input->buffer(), _k, ++ (int32*)_indices->buffer(), (uint8_t*)_values->buffer()); ++ } ++ else ++ { ++ throw std::runtime_error("Not supported type."); ++ } ++ ++ _input->unmap(q); ++ _values->unmap(q); ++ _indices->unmap(q); ++} ++} // namespace arm_compute +diff --git a/src/runtime/topk_v2.h b/src/runtime/topk_v2.h +new file mode 100644 +index 0000000..2419ee9 +--- /dev/null ++++ b/src/runtime/topk_v2.h +@@ -0,0 +1,141 @@ ++/* ++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved ++ * Copyright (C) 2017 The Android Open Source Project ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
/* (End of the Apache-2.0 license header that starts on the preceding lines.) */

#ifndef __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__
#define __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__

// Everything this header uses is included here, so it no longer depends on the
// include order of the translation unit that pulls it in.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

// Kept at global scope: includers of this header spell the type as plain `int32`.
typedef int32_t int32;

namespace nnfw
{
namespace rt
{
namespace optimized_ops
{
// The following code is implemented and modified while referring to the TFLite topk_v2.cc file.
// TopK_v2 of NN Runtime supports TENSOR_FLOAT32, TENSOR_QUANT8_ASYMM, TENSOR_INT32 other than
// TFLite.
// (TFLite additionally supports kTfLiteInt64.)

/**
 * Collects the indexes of the k largest values of one row.
 *
 * Based on template tensorflow::gtl::TopN<> but, for optimization, it re-uses
 * the same container for every row.
 *
 * Up to k + 1 indexes are stored. Once k + 1 have been seen, the first k form a
 * heap whose front is the weakest kept candidate and the last slot is scratch
 * space for the element that has just been evicted.
 */
template <typename T> class TopContainer
{
public:
  TopContainer() = delete;

  /**
   * @param k        Number of elements to keep; negative values are treated as 0.
   * @param row_size Number of elements per row (only used to size the storage).
   */
  TopContainer(int32 k, int32 row_size) : k_(std::max<int32>(k, 0)), container_()
  {
    const int32 wanted = std::max<int32>(std::min(k, row_size), 0);
    container_.reserve(static_cast<size_t>(wanted) + 1);
  }

  /** Prevent instances of this class from being copied (As this class contains pointers) */
  TopContainer(const TopContainer &) = delete;
  /** Prevent instances of this class from being copied (As this class contains pointers) */
  TopContainer &operator=(const TopContainer &) = delete;

  /** Begin a new row: remember where its values live and forget the previous row. */
  void start_collecting(const T *values)
  {
    values_ = values;
    container_.clear();
  }

  /** Offer index @p a of the current row as a top-k candidate. */
  void push(int32 a)
  {
    const auto comparator = [this](int32 lhs, int32 rhs) { return compare_fun(lhs, rhs); };
    const size_t limit = static_cast<size_t>(k_);

    if (container_.size() <= limit)
    {
      container_.push_back(a);
      if (container_.size() == limit + 1)
      {
        // k + 1 candidates collected: build the heap and move the weakest one
        // into the scratch slot at the back.
        std::make_heap(container_.begin(), container_.end(), comparator);
        std::pop_heap(container_.begin(), container_.end(), comparator);
      }
    }
    else if (comparator(a, container_.front()))
    {
      // a beats the weakest kept candidate: put it into the scratch slot, merge
      // it into the heap and evict the new weakest one to the back again.
      container_.back() = a;
      std::push_heap(container_.begin(), container_.end(), comparator);
      std::pop_heap(container_.begin(), container_.end(), comparator);
    }
  }

  /**
   * @return The collected indexes, largest value first (equal values: lower
   *         index first). Holds min(k, number of pushed elements) entries.
   */
  const std::vector<int32> &sorted_result()
  {
    const auto comparator = [this](int32 lhs, int32 rhs) { return compare_fun(lhs, rhs); };
    const size_t limit = static_cast<size_t>(k_);

    if (container_.size() <= limit)
    {
      // Fewer than k + 1 elements were pushed, so no heap was ever built.
      std::sort(container_.begin(), container_.end(), comparator);
    }
    else
    {
      // The last slot is scratch space and not part of the heap.
      std::sort_heap(container_.begin(), container_.end() - 1, comparator);
      container_.resize(limit);
    }
    return container_;
  }

private:
  int32 k_;
  std::vector<int32> container_;
  const T *values_ = nullptr;

  /** Ordering used for heap and sort: true when index @p a ranks before index
   *  @p b, i.e. its value is larger, or the values tie and a is the lower index.
   */
  bool compare_fun(int32 a, int32 b) const
  {
    if (values_[b] < values_[a])
    {
      return true;
    }
    if (values_[b] > values_[a])
    {
      return false;
    }
    return a < b;
  }
};

/**
 * Row-wise top-k selection.
 *
 * For every row, writes the indexes of the largest values to
 * output_indexes[row * k ...] and the values themselves to
 * output_values[row * k ...], largest first.
 *
 * @param row_size       Number of elements per input row.
 * @param num_rows       Number of rows in @p data.
 * @param data           Input of num_rows * row_size elements, row after row.
 * @param k              Number of elements to select per row. When k > row_size only
 *                       the first row_size entries of each output row are written.
 * @param output_indexes Output of num_rows * k indexes (relative to the row start).
 * @param output_values  Output of num_rows * k values.
 *
 * Nothing is written when k, row_size or num_rows is not positive.
 */
template <typename T>
void TopK(int32 row_size, int32 num_rows, const T *data, int32 k, int32 *output_indexes,
          T *output_values)
{
  // A negative k used to be converted to a huge unsigned limit, after which
  // row_size entries were copied into an output sized for k.
  if (k <= 0 || row_size <= 0 || num_rows <= 0)
  {
    return;
  }

  TopContainer<T> topc(k, row_size);
  for (int32 row = 0; row < num_rows; ++row)
  {
    // Offsets are computed in ptrdiff_t so that row * row_size is not done in int.
    const std::ptrdiff_t row_index = row;
    const T *values_row = data + row_index * row_size;

    topc.start_collecting(values_row);
    for (int32 c = 0; c < row_size; ++c)
    {
      topc.push(c);
    }

    // Prepare output buffers.
    int32 *indexes_row = output_indexes + row_index * k;
    T *output_row = output_values + row_index * k;
    // We always assume that the output is sorted.
    const auto &top_k = topc.sorted_result();
    std::copy(top_k.begin(), top_k.end(), indexes_row);
    std::transform(top_k.begin(), top_k.end(), output_row,
                   [values_row](const int32 loc) { return values_row[loc]; });
  }
}

} // namespace optimized_ops
} // namespace rt
} // namespace nnfw

#endif // __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__

// (patch trailer: "-- 1.9.1")