-rw-r--r--  packaging/libarmcl.manifest     5
-rw-r--r--  packaging/libarmcl.spec       159
-rw-r--r--  packaging/patch.patch        7025
3 files changed, 7189 insertions, 0 deletions
diff --git a/packaging/libarmcl.manifest b/packaging/libarmcl.manifest
new file mode 100644
index 000000000..017d22d3a
--- /dev/null
+++ b/packaging/libarmcl.manifest
@@ -0,0 +1,5 @@
+<manifest>
+ <request>
+ <domain name="_"/>
+ </request>
+</manifest>
diff --git a/packaging/libarmcl.spec b/packaging/libarmcl.spec
new file mode 100644
index 000000000..092c4e238
--- /dev/null
+++ b/packaging/libarmcl.spec
@@ -0,0 +1,159 @@
+Name: libarmcl
+Version: v19.05
+Release: 0
+License: MIT
+Url: https://github.com/ARM-software/ComputeLibrary
+Summary: The ARM Computer Vision and Machine Learning library
+Group: Graphics & UI Framework/Libraries
+Source0: %{name}-%{version}.tar.bz2
+Source1001: %{name}.manifest
+ExclusiveArch: %{arm} aarch64
+
+BuildRequires: python3-base
+BuildRequires: python
+BuildRequires: scons
+
+%define OPEN_CL_SUPPORT 1
+%define NEON_SUPPORT 1
+%define BENCHMARK_TEST 1
+
+%description
+The ARM Computer Vision and Machine Learning library is a set of functions optimised for both ARM CPUs and GPUs using SIMD technologies.
+
+%package -n %{name}-release
+Summary: ARM Compute Library file
+
+%description -n %{name}-release
+ARM Compute Library file
+
+%package -n %{name}-devel
+Summary: Userspace interface to ARM Compute Library
+
+%description -n %{name}-devel
+Userspace interface to the ARM Compute Library
+
+%package -n %{name}-tools
+Summary: Sample application and benchmark binaries to test ARM Compute Library
+
+%description -n %{name}-tools
+Sample application and benchmark binaries to test the ARM Compute Library
+
+%prep
+%setup -q
+cp %{SOURCE1001} .
+
+%build
+echo %{_builddir}
+scons -j8 \
+ Werror=0 \
+ debug=0 \
+%if 0%{?NEON_SUPPORT} == 1
+ neon=1 \
+%endif
+%if 0%{?OPEN_CL_SUPPORT} == 1
+ opencl=1 \
+%endif
+ os=linux \
+%ifarch aarch64
+ arch=arm64-v8.2-a \
+%else
+ arch=armv7a \
+%endif
+ embed_kernels=1 \
+%if 0%{?BENCHMARK_TEST} == 1
+ benchmark_tests=1
+%endif
+
+%post -p /sbin/ldconfig
+
+%postun -p /sbin/ldconfig
+
+%install
+mkdir -p %{buildroot}%{_libdir}
+mkdir -p %{buildroot}%{_libdir}/data
+mkdir -p %{buildroot}%{_bindir}
+mkdir -p %{buildroot}%{_includedir}/arm_compute
+mkdir -p %{buildroot}%{_includedir}/support
+mkdir -p %{buildroot}%{_includedir}/CL
+mkdir -p %{buildroot}%{_includedir}/half
+mkdir -p %{buildroot}%{_includedir}/libnpy
+
+install -m 644 build/libarm_compute_core.so %{buildroot}%{_libdir}
+install -m 644 build/libarm_compute.so %{buildroot}%{_libdir}
+install -m 644 build/libarm_compute_graph.so %{buildroot}%{_libdir}
+install -m 644 build/opencl-1.2-stubs/libOpenCL.so %{buildroot}%{_libdir}
+
+install -m 755 build/examples/cl_convolution %{buildroot}%{_bindir}
+install -m 755 build/examples/cl_events %{buildroot}%{_bindir}
+install -m 755 build/examples/cl_sgemm %{buildroot}%{_bindir}
+install -m 755 build/examples/graph_alexnet %{buildroot}%{_bindir}
+install -m 755 build/examples/graph_googlenet %{buildroot}%{_bindir}
+install -m 755 build/examples/graph_inception_v3 %{buildroot}%{_bindir}
+install -m 755 build/examples/graph_inception_v4 %{buildroot}%{_bindir}
+install -m 755 build/examples/graph_lenet %{buildroot}%{_bindir}
+install -m 755 build/examples/graph_mobilenet %{buildroot}%{_bindir}
+install -m 755 build/examples/graph_resnet50 %{buildroot}%{_bindir}
+install -m 755 build/examples/graph_resnext50 %{buildroot}%{_bindir}
+install -m 755 build/examples/graph_squeezenet %{buildroot}%{_bindir}
+install -m 755 build/examples/graph_squeezenet_v1_1 %{buildroot}%{_bindir}
+install -m 755 build/examples/graph_vgg16 %{buildroot}%{_bindir}
+install -m 755 build/examples/graph_vgg19 %{buildroot}%{_bindir}
+install -m 755 build/examples/neon_cartoon_effect %{buildroot}%{_bindir}
+install -m 755 build/examples/neoncl_scale_median_gaussian %{buildroot}%{_bindir}
+install -m 755 build/examples/neon_cnn %{buildroot}%{_bindir}
+install -m 755 build/examples/neon_convolution %{buildroot}%{_bindir}
+install -m 755 build/examples/neon_copy_objects %{buildroot}%{_bindir}
+install -m 755 build/examples/neon_scale %{buildroot}%{_bindir}
+
+cp -r %{_builddir}/%{name}-%{version}/arm_compute/* %{buildroot}%{_includedir}/arm_compute/
+cp -r %{_builddir}/%{name}-%{version}/support/* %{buildroot}%{_includedir}/support/
+cp -r %{_builddir}/%{name}-%{version}/include/CL/* %{buildroot}%{_includedir}/CL/
+cp -r %{_builddir}/%{name}-%{version}/include/half/* %{buildroot}%{_includedir}/half/
+cp -r %{_builddir}/%{name}-%{version}/include/libnpy/* %{buildroot}%{_includedir}/libnpy/
+
+%if 0%{?BENCHMARK_TEST} == 1
+install -m 755 %{_builddir}/%{name}-%{version}/build/tests/arm_compute_benchmark %{buildroot}%{_bindir}
+cp -r %{_builddir}/%{name}-%{version}/data/* %{buildroot}%{_libdir}/data/
+%endif
+
+%files -n %{name}-release
+%manifest %{name}.manifest
+%{_libdir}/libarm_compute*.so
+
+%files -n %{name}-devel
+%manifest %{name}.manifest
+%{_libdir}/libarm_compute*.so
+%{_libdir}/libOpenCL.so
+%{_includedir}/arm_compute/*
+%{_includedir}/support/*
+%{_includedir}/CL/*
+%{_includedir}/half/*
+%{_includedir}/libnpy/*
+
+%files -n %{name}-tools
+%manifest %{name}.manifest
+%{_bindir}/cl_convolution
+%{_bindir}/cl_events
+%{_bindir}/cl_sgemm
+%{_bindir}/graph_alexnet
+%{_bindir}/graph_googlenet
+%{_bindir}/graph_inception_v3
+%{_bindir}/graph_inception_v4
+%{_bindir}/graph_lenet
+%{_bindir}/graph_mobilenet
+%{_bindir}/graph_resnet50
+%{_bindir}/graph_resnext50
+%{_bindir}/graph_squeezenet
+%{_bindir}/graph_squeezenet_v1_1
+%{_bindir}/graph_vgg16
+%{_bindir}/graph_vgg19
+%{_bindir}/neon_cartoon_effect
+%{_bindir}/neoncl_scale_median_gaussian
+%{_bindir}/neon_cnn
+%{_bindir}/neon_convolution
+%{_bindir}/neon_copy_objects
+%{_bindir}/neon_scale
+%if 0%{?BENCHMARK_TEST} == 1
+%{_bindir}/arm_compute_benchmark
+%{_libdir}/data/*
+%endif
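For reference: with the defaults defined above (NEON_SUPPORT, OPEN_CL_SUPPORT and BENCHMARK_TEST all set to 1), the conditional %build recipe expands to a single scons command. On aarch64 the expansion is equivalent to the sketch below; on 32-bit ARM only arch=armv7a differs. This is reconstructed from the spec itself, not captured from a build log:

    scons -j8 Werror=0 debug=0 neon=1 opencl=1 os=linux \
        arch=arm64-v8.2-a embed_kernels=1 benchmark_tests=1
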
diff --git a/packaging/patch.patch b/packaging/patch.patch
new file mode 100644
index 000000000..86a046cd4
--- /dev/null
+++ b/packaging/patch.patch
@@ -0,0 +1,7025 @@
+From eb0682abf46a5d1ee1c4bfc780815f948c912aca Mon Sep 17 00:00:00 2001
+From: Chunseok Lee <chunseok.lee@samsung.com>
+Date: Thu, 23 Aug 2018 17:42:09 +0900
+Subject: [PATCH] Patch for NNFW M2 Release
+
+1. Add new operations
+2. Fix some issue on existing ops
+
+Change-Id: I8da858291993ba474c8d285d8c63e75f5cf37083
+Signed-off-by: Chunseok Lee <chunseok.lee@samsung.com>
+---
+ .../core/CL/kernels/CLArithmeticAdditionKernel.h | 12 +-
+ .../CL/kernels/CLArithmeticSubtractionKernel.h | 2 +
+ arm_compute/core/CL/kernels/CLCastKernel.h | 65 +++
+ arm_compute/core/CL/kernels/CLGatherKernel.h | 77 ++++
+ .../core/CL/kernels/CLPixelWiseDivisionKernel.h | 88 ++++
+ .../CL/kernels/CLPixelWiseMultiplicationKernel.h | 8 +-
+ arm_compute/core/CL/kernels/CLReduceMaxKernel.h | 78 ++++
+ .../core/CL/kernels/CLReductionMeanKernel.h | 83 ++++
+ arm_compute/core/CL/kernels/CLStridedSliceKernel.h | 106 +++++
+ arm_compute/core/CL/kernels/CLTopKV2Kernel.h | 309 +++++++++++++
+ arm_compute/core/Helpers.inl | 33 ++
+ arm_compute/runtime/CL/CLFunctions.h | 8 +
+ .../runtime/CL/functions/CLArithmeticAddition.h | 12 +-
+ .../runtime/CL/functions/CLArithmeticSubtraction.h | 13 +-
+ arm_compute/runtime/CL/functions/CLCast.h | 52 +++
+ arm_compute/runtime/CL/functions/CLGather.h | 56 +++
+ .../runtime/CL/functions/CLPixelWiseDivision.h | 71 +++
+ .../CL/functions/CLPixelWiseMultiplication.h | 8 +-
+ arm_compute/runtime/CL/functions/CLReduceMax.h | 89 ++++
+ arm_compute/runtime/CL/functions/CLReductionMean.h | 76 ++++
+ arm_compute/runtime/CL/functions/CLStridedSlice.h | 73 ++++
+ arm_compute/runtime/CL/functions/CLTopKV2.h | 115 +++++
+ src/core/CL/CLKernelLibrary.cpp | 72 ++++
+ src/core/CL/cl_kernels/activation_layer_qa8.cl | 107 ++++-
+ src/core/CL/cl_kernels/arithmetic_op_quantized.cl | 138 ++++++
+ src/core/CL/cl_kernels/cast.cl | 148 +++++++
+ src/core/CL/cl_kernels/fixed_point.h | 24 ++
+ src/core/CL/cl_kernels/gather.cl | 106 +++++
+ src/core/CL/cl_kernels/pixelwise_div_float.cl | 96 +++++
+ src/core/CL/cl_kernels/pixelwise_div_int.cl | 103 +++++
+ src/core/CL/cl_kernels/pixelwise_mul_quantized.cl | 119 +++++
+ src/core/CL/cl_kernels/reduce_max.cl | 60 +++
+ src/core/CL/cl_kernels/reduction_mean.cl | 69 +++
+ src/core/CL/cl_kernels/strided_slice.cl | 104 +++++
+ src/core/CL/cl_kernels/topkv2.cl | 111 +++++
+ src/core/CL/cl_kernels/topkv2_quicksort.cl | 138 ++++++
+ src/core/CL/cl_kernels/topkv2_radixsort.cl | 279 ++++++++++++
+ src/core/CL/kernels/CLActivationLayerKernel.cpp | 53 ++-
+ src/core/CL/kernels/CLArithmeticAdditionKernel.cpp | 46 +-
+ .../CL/kernels/CLArithmeticSubtractionKernel.cpp | 125 ++++--
+ src/core/CL/kernels/CLCastKernel.cpp | 115 +++++
+ src/core/CL/kernels/CLGatherKernel.cpp | 147 +++++++
+ src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp | 284 ++++++++++++
+ .../CL/kernels/CLPixelWiseMultiplicationKernel.cpp | 37 +-
+ src/core/CL/kernels/CLReduceMaxKernel.cpp | 135 ++++++
+ src/core/CL/kernels/CLReductionMeanKernel.cpp | 190 ++++++++
+ src/core/CL/kernels/CLStridedSliceKernel.cpp | 316 ++++++++++++++
+ src/core/CL/kernels/CLTopKV2Kernel.cpp | 479 +++++++++++++++++++++
+ src/core/Validate.cpp | 2 +-
+ .../CL/functions/CLArithmeticSubtraction.cpp | 14 +-
+ src/runtime/CL/functions/CLCast.cpp | 37 ++
+ src/runtime/CL/functions/CLGather.cpp | 45 ++
+ src/runtime/CL/functions/CLPixelWiseDivision.cpp | 57 +++
+ src/runtime/CL/functions/CLReduceMax.cpp | 132 ++++++
+ src/runtime/CL/functions/CLReductionMean.cpp | 60 +++
+ src/runtime/CL/functions/CLStridedSlice.cpp | 288 +++++++++++++
+ src/runtime/CL/functions/CLTopKV2.cpp | 310 +++++++++++++
+ src/runtime/topk_v2.h | 141 ++++++
+ 58 files changed, 6038 insertions(+), 83 deletions(-)
+ create mode 100644 arm_compute/core/CL/kernels/CLCastKernel.h
+ create mode 100644 arm_compute/core/CL/kernels/CLGatherKernel.h
+ create mode 100644 arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h
+ create mode 100644 arm_compute/core/CL/kernels/CLReduceMaxKernel.h
+ create mode 100644 arm_compute/core/CL/kernels/CLReductionMeanKernel.h
+ create mode 100644 arm_compute/core/CL/kernels/CLStridedSliceKernel.h
+ create mode 100644 arm_compute/core/CL/kernels/CLTopKV2Kernel.h
+ create mode 100644 arm_compute/runtime/CL/functions/CLCast.h
+ create mode 100644 arm_compute/runtime/CL/functions/CLGather.h
+ create mode 100644 arm_compute/runtime/CL/functions/CLPixelWiseDivision.h
+ create mode 100644 arm_compute/runtime/CL/functions/CLReduceMax.h
+ create mode 100644 arm_compute/runtime/CL/functions/CLReductionMean.h
+ create mode 100644 arm_compute/runtime/CL/functions/CLStridedSlice.h
+ create mode 100644 arm_compute/runtime/CL/functions/CLTopKV2.h
+ create mode 100644 src/core/CL/cl_kernels/arithmetic_op_quantized.cl
+ create mode 100644 src/core/CL/cl_kernels/cast.cl
+ create mode 100644 src/core/CL/cl_kernels/gather.cl
+ create mode 100644 src/core/CL/cl_kernels/pixelwise_div_float.cl
+ create mode 100644 src/core/CL/cl_kernels/pixelwise_div_int.cl
+ create mode 100644 src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
+ create mode 100644 src/core/CL/cl_kernels/reduce_max.cl
+ create mode 100644 src/core/CL/cl_kernels/reduction_mean.cl
+ create mode 100644 src/core/CL/cl_kernels/strided_slice.cl
+ create mode 100644 src/core/CL/cl_kernels/topkv2.cl
+ create mode 100644 src/core/CL/cl_kernels/topkv2_quicksort.cl
+ create mode 100644 src/core/CL/cl_kernels/topkv2_radixsort.cl
+ create mode 100644 src/core/CL/kernels/CLCastKernel.cpp
+ create mode 100644 src/core/CL/kernels/CLGatherKernel.cpp
+ create mode 100644 src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp
+ create mode 100644 src/core/CL/kernels/CLReduceMaxKernel.cpp
+ create mode 100644 src/core/CL/kernels/CLReductionMeanKernel.cpp
+ create mode 100644 src/core/CL/kernels/CLStridedSliceKernel.cpp
+ create mode 100644 src/core/CL/kernels/CLTopKV2Kernel.cpp
+ create mode 100644 src/runtime/CL/functions/CLCast.cpp
+ create mode 100644 src/runtime/CL/functions/CLGather.cpp
+ create mode 100644 src/runtime/CL/functions/CLPixelWiseDivision.cpp
+ create mode 100644 src/runtime/CL/functions/CLReduceMax.cpp
+ create mode 100644 src/runtime/CL/functions/CLReductionMean.cpp
+ create mode 100644 src/runtime/CL/functions/CLStridedSlice.cpp
+ create mode 100644 src/runtime/CL/functions/CLTopKV2.cpp
+ create mode 100644 src/runtime/topk_v2.h
+
+diff --git a/arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h b/arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h
+index 5112476..017650f 100644
+--- a/arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h
++++ b/arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h
+@@ -53,17 +53,17 @@ public:
+ ~CLArithmeticAdditionKernel() = default;
+ /** Initialise the kernel's inputs, output and convertion policy.
+ *
+- * @param[in] input1 First tensor input. Data types supported: U8/QS8/QS16/S16/F16/F32.
+- * @param[in] input2 Second tensor input. Data types supported: U8/QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16/F32.
+- * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16/F32.
++ * @param[in] input1 First tensor input. Data types supported: U8/QS8/QASYMM8/QS16/S16/F16/F32.
++ * @param[in] input2 Second tensor input. Data types supported: U8/QS8 (only if @p input1 is QS8), QASYMM8(only if @p input1 is QASYMM8), QS16 (only if @p input1 is QS16), S16/F16/F32.
++ * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QASYMM8 (only if both inputs are QASYMM8), QS16 (only if both inputs are QS16), S16/F16/F32.
+ * @param[in] policy Policy to use to handle overflow.
+ */
+ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticAdditionKernel
+ *
+- * @param[in] input1 First tensor input info. Data types supported: U8/QS8/QS16/S16/F16/F32.
+- * @param[in] input2 Second tensor input info. Data types supported: U8/QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16/F32.
+- * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16/F32.
++ * @param[in] input1 First tensor input info. Data types supported: U8/QS8/QASYMM8/QS16/S16/F16/F32.
++ * @param[in] input2 Second tensor input info. Data types supported: U8/QS8 (only if @p input1 is QS8), QASYMM8 (only if @p input1 is QASYMM8), QS16 (only if @p input1 is QS16), S16/F16/F32.
++ * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QASYMM8 (only if both inputs are QASYMM8), QS16 (only if both inputs are QS16), S16/F16/F32.
+ * @param[in] policy Policy to use to handle overflow.
+ *
+ * @return a status
+diff --git a/arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h b/arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h
+index c5f862a..5e374a5 100644
+--- a/arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h
++++ b/arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h
+@@ -1,4 +1,5 @@
+ /*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+@@ -74,6 +75,7 @@ public:
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
++ BorderSize border_size() const override;
+
+ private:
+ const ICLTensor *_input1; /**< Source tensor 1 */
+diff --git a/arm_compute/core/CL/kernels/CLCastKernel.h b/arm_compute/core/CL/kernels/CLCastKernel.h
+new file mode 100644
+index 0000000..19e482f
+--- /dev/null
++++ b/arm_compute/core/CL/kernels/CLCastKernel.h
+@@ -0,0 +1,65 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2016-2018 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#ifndef __ARM_COMPUTE_CLCASTKERNEL_H__
++#define __ARM_COMPUTE_CLCASTKERNEL_H__
++
++#include "arm_compute/core/CL/ICLKernel.h"
++
++namespace arm_compute
++{
++class ICLTensor;
++
++/** OpenCL kernel to perform a cast operation */
++class CLCastKernel : public ICLKernel
++{
++public:
++ /** Default constructor */
++ CLCastKernel();
++ /** Prevent instances of this class from being copied (As this class contains pointers) */
++ CLCastKernel(const CLCastKernel &) = delete;
++ /** Prevent instances of this class from being copied (As this class contains pointers) */
++ CLCastKernel &operator=(const CLCastKernel &) = delete;
++ /** Allow instances of this class to be moved */
++ CLCastKernel(CLCastKernel &&) = default;
++ /** Allow instances of this class to be moved */
++ CLCastKernel &operator=(CLCastKernel &&) = default;
++ /** Default destructor */
++ ~CLCastKernel() = default;
++ /** Initialise the kernel's input and output.
++ *
++ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
++ * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
++ */
++ void configure(const ICLTensor *input, ICLTensor *output);
++
++ // Inherited methods overridden:
++ void run(const Window &window, cl::CommandQueue &queue) override;
++
++private:
++ const ICLTensor *_input; /**< Source tensor */
++ ICLTensor *_output; /**< Destination tensor */
++};
++} // namespace arm_compute
++#endif /* __ARM_COMPUTE_CLCASTKERNEL_H__ */
+diff --git a/arm_compute/core/CL/kernels/CLGatherKernel.h b/arm_compute/core/CL/kernels/CLGatherKernel.h
+new file mode 100644
+index 0000000..530491a
+--- /dev/null
++++ b/arm_compute/core/CL/kernels/CLGatherKernel.h
+@@ -0,0 +1,77 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2016-2018 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#ifndef __ARM_COMPUTE_CLGATHERKERNEL_H__
++#define __ARM_COMPUTE_CLGATHERKERNEL_H__
++
++#include "arm_compute/core/CL/ICLKernel.h"
++#include "arm_compute/core/Types.h"
++
++namespace arm_compute
++{
++class ICLTensor;
++
++/** Interface for the gather kernel.
++ *
++ */
++class CLGatherKernel : public ICLKernel
++{
++public:
++ /** Default constructor.*/
++ CLGatherKernel();
++ /** Prevent instances of this class from being copied (As this class contains pointers). */
++ CLGatherKernel(const CLGatherKernel &) = delete;
++ /** Prevent instances of this class from being copied (As this class contains pointers). */
++ CLGatherKernel &operator=(const CLGatherKernel &) = delete;
++ /** Allow instances of this class to be moved */
++ CLGatherKernel(CLGatherKernel &&) = default;
++ /** Allow instances of this class to be moved */
++ CLGatherKernel &operator=(CLGatherKernel &&) = default;
++ /** Initialise the kernel's input, output and border mode.
++ *
++ * @param[in] input1 An input tensor. Data types supported: U8/S32/F32.
++ * @param[in] input2 An input tensor. Data types supported: S32.
++ * @param[out] output The output tensor, Data types supported: same as @p input1.
++ */
++ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
++ /** Static function to check if given info will lead to a valid configuration of @ref CLGatherKernel
++ *
++ * @param[in] input1 An input tensor info. Data types supported: U8/S32/F32.
++ * @param[in] input2 An input tensor info. Data types supported: S32.
++ * @param[in] output The output tensor info, Data types supported: same as @p input1.
++ *
++ * @return a status
++ */
++ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
++
++ // Inherited methods overridden:
++ void run(const Window &window, cl::CommandQueue &queue) override;
++
++private:
++ const ICLTensor *_input1;
++ const ICLTensor *_input2;
++ ICLTensor *_output;
++};
++} // namespace arm_compute
++#endif /*__ARM_COMPUTE_CLGATHERKERNEL_H__ */
+diff --git a/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h b/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h
+new file mode 100644
+index 0000000..2e542b3
+--- /dev/null
++++ b/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h
+@@ -0,0 +1,88 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2016-2018 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#ifndef __ARM_COMPUTE_CLPIXELWISEDIVISIONKERNEL_H__
++#define __ARM_COMPUTE_CLPIXELWISEDIVISIONKERNEL_H__
++
++#include "arm_compute/core/CL/ICLKernel.h"
++#include "arm_compute/core/Types.h"
++
++namespace arm_compute
++{
++class ICLTensor;
++
++/** Interface for the pixelwise division kernel.
++ *
++ */
++class CLPixelWiseDivisionKernel : public ICLKernel
++{
++public:
++ /** Default constructor.*/
++ CLPixelWiseDivisionKernel();
++ /** Prevent instances of this class from being copied (As this class contains pointers). */
++ CLPixelWiseDivisionKernel(const CLPixelWiseDivisionKernel &) = delete;
++ /** Prevent instances of this class from being copied (As this class contains pointers). */
++ CLPixelWiseDivisionKernel &operator=(const CLPixelWiseDivisionKernel &) = delete;
++ /** Allow instances of this class to be moved */
++ CLPixelWiseDivisionKernel(CLPixelWiseDivisionKernel &&) = default;
++ /** Allow instances of this class to be moved */
++ CLPixelWiseDivisionKernel &operator=(CLPixelWiseDivisionKernel &&) = default;
++ /** Initialise the kernel's input, output and border mode.
++ *
++ * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
++ * @param[in] input2 An input tensor. Data types supported: same as @p input1.
++ * @param[out] output The output tensor, Data types supported: same as @p input1. Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
++ * @param[in] scale Scale to apply after division.
++ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1.
++ * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
++ * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
++ */
++ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale,
++ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
++ /** Static function to check if given info will lead to a valid configuration of @ref CLPixelWiseDivisionKernel
++ *
++ * @param[in] input1 An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32.
++ * @param[in] input2 An input tensor info. Data types supported: same as @p input1.
++ * @param[in] output The output tensor info, Data types supported: same as @p input1. Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
++ * @param[in] scale Scale to apply after division.
++ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1.
++ * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
++ * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
++ *
++ * @return a status
++ */
++ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
++ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
++
++ // Inherited methods overridden:
++ void run(const Window &window, cl::CommandQueue &queue) override;
++ BorderSize border_size() const override;
++
++private:
++ const ICLTensor *_input1;
++ const ICLTensor *_input2;
++ ICLTensor *_output;
++};
++} // namespace arm_compute
++#endif /*__ARM_COMPUTE_CLPIXELWISEDIVISIONKERNEL_H__ */
+diff --git a/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h b/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h
+index fcabb61..66c0b36 100644
+--- a/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h
++++ b/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h
+@@ -49,9 +49,9 @@ public:
+ CLPixelWiseMultiplicationKernel &operator=(CLPixelWiseMultiplicationKernel &&) = default;
+ /** Initialise the kernel's input, output and border mode.
+ *
+- * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
++ * @param[in] input1 An input tensor. Data types supported: U8/QS8/QASYMM8/QS16/S16/F16/F32.
+ * @param[in] input2 An input tensor. Data types supported: same as @p input1.
+- * @param[out] output The output tensor, Data types supported: same as @p input1. Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
++ * @param[out] output The output tensor, Data types supported: same as @p input1. Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). QASYMM8 requires both inputs to be QASYMM8.
+ * @param[in] scale Scale to apply after multiplication.
+ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1.
+ * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
+@@ -61,9 +61,9 @@ public:
+ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLPixelWiseMultiplicationKernel
+ *
+- * @param[in] input1 An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32.
++ * @param[in] input1 An input tensor info. Data types supported: U8/QS8/QASYMM8/QS16/S16/F16/F32.
+ * @param[in] input2 An input tensor info. Data types supported: same as @p input1.
+- * @param[in] output The output tensor info, Data types supported: same as @p input1. Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
++ * @param[in] output The output tensor info, Data types supported: same as @p input1. Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). QASYMM8 requires both inputs to be QASYMM8.
+ * @param[in] scale Scale to apply after multiplication.
+ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1.
+ * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
+diff --git a/arm_compute/core/CL/kernels/CLReduceMaxKernel.h b/arm_compute/core/CL/kernels/CLReduceMaxKernel.h
+new file mode 100644
+index 0000000..184389a
+--- /dev/null
++++ b/arm_compute/core/CL/kernels/CLReduceMaxKernel.h
+@@ -0,0 +1,78 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2016-2018 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#ifndef __ARM_COMPUTE_CLREDUCEMAXKERNEL_H__
++#define __ARM_COMPUTE_CLREDUCEMAXKERNEL_H__
++
++#include "arm_compute/core/CL/ICLKernel.h"
++#include "arm_compute/core/Types.h"
++
++namespace arm_compute
++{
++class ICLTensor;
++
++/** Interface for the reduce max kernel.
++ *
++ */
++class CLReduceMaxKernel : public ICLKernel
++{
++public:
++ /** Default constructor.*/
++ CLReduceMaxKernel();
++ /** Prevent instances of this class from being copied (As this class contains pointers). */
++ CLReduceMaxKernel(const CLReduceMaxKernel &) = delete;
++ /** Prevent instances of this class from being copied (As this class contains pointers). */
++ CLReduceMaxKernel &operator=(const CLReduceMaxKernel &) = delete;
++ /** Allow instances of this class to be moved */
++ CLReduceMaxKernel(CLReduceMaxKernel &&) = default;
++ /** Allow instances of this class to be moved */
++ CLReduceMaxKernel &operator=(CLReduceMaxKernel &&) = default;
++ /** Initialise the kernel's input, output and border mode.
++ *
++ * @param[in] input An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
++ * @param[in] axis Axis to reduce
++ * @param[out] output The output tensor, Data types supported: same as @p input.
++ */
++ void configure(const ICLTensor *input, int32_t axis, ICLTensor *output);
++ /** Static function to check if given info will lead to a valid configuration of @ref CLReduceMaxKernel
++ *
++ * @param[in] input An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32.
++ * @param[in] axis Axis to reduce
++ * @param[in] output The output tensor info, Data types supported: same as @p input.
++ *
++ * @return a status
++ */
++ static Status validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output);
++
++ // Inherited methods overridden:
++ void run(const Window &window, cl::CommandQueue &queue) override;
++ void run_on_cpu(cl::CommandQueue &queue);
++
++private:
++ const ICLTensor *_input;
++ ICLTensor *_output;
++ int32_t _axis;
++};
++} // namespace arm_compute
++#endif /*__ARM_COMPUTE_CLREDUCEMAXKERNEL_H__ */
+diff --git a/arm_compute/core/CL/kernels/CLReductionMeanKernel.h b/arm_compute/core/CL/kernels/CLReductionMeanKernel.h
+new file mode 100644
+index 0000000..687fdb5
+--- /dev/null
++++ b/arm_compute/core/CL/kernels/CLReductionMeanKernel.h
+@@ -0,0 +1,83 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2017-2018 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#ifndef __ARM_COMPUTE_CLREDUCTIONMEANKERNEL_H__
++#define __ARM_COMPUTE_CLREDUCTIONMEANKERNEL_H__
++
++#include "arm_compute/core/CL/ICLKernel.h"
++#include "arm_compute/core/Types.h"
++
++namespace arm_compute
++{
++class ICLTensor;
++
++/** Interface for the reduction operation kernel */
++class CLReductionMeanKernel : public ICLKernel
++{
++public:
++ /** Default constructor */
++ CLReductionMeanKernel();
++ /** Prevent instances of this class from being copied (As this class contains pointers) */
++ CLReductionMeanKernel(const CLReductionMeanKernel &) = delete;
++ /** Prevent instances of this class from being copied (As this class contains pointers) */
++ CLReductionMeanKernel &operator=(const CLReductionMeanKernel &) = delete;
++ /** Allow instances of this class to be moved */
++ CLReductionMeanKernel(CLReductionMeanKernel &&) = default;
++ /** Allow instances of this class to be moved */
++ CLReductionMeanKernel &operator=(CLReductionMeanKernel &&) = default;
++ /** Default destructor */
++ ~CLReductionMeanKernel() = default;
++
++ /** Set the input and output tensors.
++ *
++ * @param[in] input Source tensor. Data types supported: F32. Data layouts supported: NCHW.
++ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
++ * Output will have the same number of dimensions as input.
++ * @param[in] axis Axis along which to reduce. Supported reduction axes: 0, 1
++ */
++ void configure(const ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis);
++
++ /** Static function to check if given info will lead to a valid configuration of @ref CLReductionMeanKernel.
++ *
++ * @param[in] input Source tensor info. Data types supported: F32. Data layouts supported: NCHW.
++ * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p input.
++ * Output will have the same number of dimensions as input.
++ * @param[in] axis Axis along which to reduce. Supported reduction axes: 0, 1
++ *
++ * @return a status
++ */
++ static Status validate(const ITensorInfo *input, const ITensorInfo *output, std::vector<uint32_t> axis);
++
++ // Inherited methods overridden:
++ void run(const Window &window, cl::CommandQueue &queue) override;
++ BorderSize border_size() const override;
++
++private:
++ const ICLTensor *_input;
++ ICLTensor *_output;
++ std::vector<uint32_t> _reduction_axis;
++ BorderSize _border_size;
++};
++} // namespace arm_compute
++#endif /*__ARM_COMPUTE_CLREDUCTIONMEANKERNEL_H__ */
+diff --git a/arm_compute/core/CL/kernels/CLStridedSliceKernel.h b/arm_compute/core/CL/kernels/CLStridedSliceKernel.h
+new file mode 100644
+index 0000000..456c27d
+--- /dev/null
++++ b/arm_compute/core/CL/kernels/CLStridedSliceKernel.h
+@@ -0,0 +1,106 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2017 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#ifndef __ARM_COMPUTE_CLSTRIDEDSLICEKERNEL_H__
++#define __ARM_COMPUTE_CLSTRIDEDSLICEKERNEL_H__
++
++#include "arm_compute/core/CL/ICLKernel.h"
++#include "arm_compute/core/Types.h"
++
++namespace arm_compute
++{
++class ICLTensor;
++
++/** Interface for the kernel to extract a strided slice of a tensor */
++class CLStridedSliceKernel : public ICLKernel
++{
++public:
++ /** Default constructor */
++ CLStridedSliceKernel();
++ /** Prevent instances of this class from being copied (As this class contains pointers) */
++ CLStridedSliceKernel(const CLStridedSliceKernel &) = delete;
++ /** Prevent instances of this class from being copied (As this class contains pointers) */
++ CLStridedSliceKernel &operator=(const CLStridedSliceKernel &) = delete;
++ /** Allow instances of this class to be moved */
++ CLStridedSliceKernel(CLStridedSliceKernel &&) = default;
++ /** Allow instances of this class to be moved */
++ CLStridedSliceKernel &operator=(CLStridedSliceKernel &&) = default;
++ /** Default destructor */
++ ~CLStridedSliceKernel() = default;
++ /** Set the input and output of the kernel
++ *
++ * @param[in] input Source tensor. Data type supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32
++ * @param[out] output Destination tensor. Data type supported: Same as @p input
++ * @param[in] beginData The begin tensor. Data types supported: S32.
++ * The number of dimensions must be 1.
++ * The length must be the same as the number of dimensions of input.
++ * @param[in] endData The end tensor. Data types supported: S32.
++ * The number of dimensions must be 1.
++ * The length must be the same as the number of dimensions of input.
++ * @param[in] stridesData The stride tensor. Data types supported: S32.
++ * The number of dimensions must be 1.
++ * The length must be the same as the number of dimensions of input.
++ * @param[in] beginMask Mask for begin
++ * @param[in] endMask Mask for end
++ * @param[in] shrinkAxisMask Mask for shrink axis.
++ *
++ */
++ void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData, ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask, int32_t shrinkAxisMask);
++
++ /** Static function to check if given info will lead to a valid configuration of @ref CLStridedSliceKernel
++ *
++ * @param[in] input The input tensor info. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32
++ * @param[in] output The output tensor info, Data types supported: same as @p input.
++ * @param[in] begin The begin tensor info. Data types supported: S32.
++ * The number of dimensions must be 1.
++ * The length must be the same as the number of dimensions of input.
++ * @param[in] end The end tensor info. Data types supported: S32.
++ * The number of dimensions must be 1.
++ * The length must be the same as the number of dimensions of input.
++ * @param[in] stride The stride tensor info. Data types supported: S32.
++ * The number of dimensions must be 1.
++ * The length must be the same as the number of dimensions of input.
++ * @param[in] beginMask Mask for begin
++ * @param[in] endMask Mask for end
++ * @param[in] shrinkAxisMask Mask for shrink axis.
++ *
++ * @return a status
++ */
++ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *begin, const ITensorInfo *end, const ITensorInfo *stride, int32_t beginMask, int32_t endMask, int32_t shrinkAxisMask);
++
++ // Inherited methods overridden:
++ void run(const Window &window, cl::CommandQueue &queue) override;
++
++private:
++ const ICLTensor *_input; /**< Source tensor */
++ ICLTensor *_output; /**< Destination tensor */
++ ICLTensor *_beginData; /**< Start indices of input tensor */
++ ICLTensor *_endData; /**< Stop indices of input tensor */
++ ICLTensor *_stridesData; /**< Strides tensor */
++ int32_t _beginMask; /**< Begin mask */
++ int32_t _endMask; /**< End mask */
++ int32_t _shrinkAxisMask; /**< Shrink axis mask */
++};
++} // namespace arm_compute
++#endif /*__ARM_COMPUTE_CLSTRIDEDSLICEKERNEL_H__ */
+diff --git a/arm_compute/core/CL/kernels/CLTopKV2Kernel.h b/arm_compute/core/CL/kernels/CLTopKV2Kernel.h
+new file mode 100644
+index 0000000..09bcfe5
+--- /dev/null
++++ b/arm_compute/core/CL/kernels/CLTopKV2Kernel.h
+@@ -0,0 +1,309 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2017 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#ifndef __ARM_COMPUTE_CLTOPKV2KERNEL_H__
++#define __ARM_COMPUTE_CLTOPKV2KERNEL_H__
++
++#include "arm_compute/core/CL/ICLArray.h"
++#include "arm_compute/core/CL/ICLKernel.h"
++
++#include <array>
++
++// these parameters can be changed
++#define _ITEMS 16 // number of items in a group
++#define _GROUPS 4 // the number of virtual processors is _ITEMS * _GROUPS
++#define _HISTOSPLIT (_ITEMS*_GROUPS/2) // number of splits of the histogram
++#define PERMUT // store the final permutation
++////////////////////////////////////////////////////////
++
++namespace arm_compute
++{
++class ICLTensor;
++
++class CLTopKV2Single : public ICLKernel
++{
++public:
++ /** Constructor */
++ CLTopKV2Single();
++ /** Prevent instances of this class from being copied (As this class contains pointers) */
++ CLTopKV2Single(const CLTopKV2Single &) = delete;
++ /** Prevent instances of this class from being copied (As this class contains pointers) */
++ CLTopKV2Single &operator=(const CLTopKV2Single &) = delete;
++ /** Allow instances of this class to be moved */
++ CLTopKV2Single(CLTopKV2Single &&) = default;
++ /** Allow instances of this class to be moved */
++ CLTopKV2Single &operator=(CLTopKV2Single &&) = default;
++
++ void configure(ICLTensor *input, ICLTensor *topk_values,
++ ICLTensor *topk_indices, cl::Buffer *indices,
++ cl::Buffer *temp_stack, int k, int n);
++
++ // Inherited methods overridden:
++ void run(const Window &window, cl::CommandQueue &queue) override;
++
++private:
++ ICLTensor *_input;
++ ICLTensor *_topk_values;
++ ICLTensor *_topk_indices;
++};
++
++class CLTopKV2Init : public ICLKernel
++{
++public:
++ /** Constructor */
++ CLTopKV2Init();
++ /** Prevent instances of this class from being copied (As this class contains pointers) */
++ CLTopKV2Init(const CLTopKV2Init &) = delete;
++ /** Prevent instances of this class from being copied (As this class contains pointers) */
++ CLTopKV2Init &operator=(const CLTopKV2Init &) = delete;
++ /** Allow instances of this class to be moved */
++ CLTopKV2Init(CLTopKV2Init &&) = default;
++ /** Allow instances of this class to be moved */
++ CLTopKV2Init &operator=(CLTopKV2Init &&) = default;
++
++ void configure(ICLTensor *input, cl::Buffer* in_key_buf,
++ cl::Buffer* in_ind_buf, int n);
++
++ // Inherited methods overridden:
++ void run(const Window &window, cl::CommandQueue &queue) override;
++
++private:
++ ICLTensor *_input;
++};
++
++class CLRadixSortHistogram : public ICLKernel
++{
++public:
++ /** Constructor */
++ CLRadixSortHistogram();
++ /** Prevent instances of this class from being copied (As this class contains pointers) */
++ CLRadixSortHistogram(const CLRadixSortHistogram &) = delete;
++ /** Prevent instances of this class from being copied (As this class contains pointers) */
++ CLRadixSortHistogram &operator=(const CLRadixSortHistogram &) = delete;
++ /** Allow instances of this class to be moved */
++ CLRadixSortHistogram(CLRadixSortHistogram &&) = default;
++ /** Allow instances of this class to be moved */
++ CLRadixSortHistogram &operator=(CLRadixSortHistogram &&) = default;
++
++ void configure(cl::Buffer* hist_buf, int bits, int n);
++
++ void setPass(int pass, cl::Buffer *in_key_buf) {
++ _pass = pass;
++ _in_key_buf = in_key_buf;
++ }
++
++ // Inherited methods overridden:
++ void run(const Window &window, cl::CommandQueue &queue) override;
++
++private:
++ int _pass;
++ cl::Buffer *_in_key_buf;
++};
++
++class CLRadixSortScanHistogram : public ICLKernel
++{
++public:
++ /** Constructor */
++ CLRadixSortScanHistogram();
++ /** Prevent instances of this class from being copied (As this class contains pointers) */
++ CLRadixSortScanHistogram(const CLRadixSortScanHistogram &) = delete;
++ /** Prevent instances of this class from being copied (As this class contains pointers) */
++ CLRadixSortScanHistogram &operator=(const CLRadixSortScanHistogram &) = delete;
++ /** Allow instances of this class to be moved */
++ CLRadixSortScanHistogram(CLRadixSortScanHistogram &&) = default;
++ /** Allow instances of this class to be moved */
++ CLRadixSortScanHistogram &operator=(CLRadixSortScanHistogram &&) = default;
++
++ void configure(cl::Buffer* hist_buf, cl::Buffer* glob_sum_buf, int bits);
++
++ // Inherited methods overridden:
++ void run(const Window &window, cl::CommandQueue &queue) override;
++};
++
++class CLRadixSortGlobalScanHistogram : public ICLKernel
++{
++public:
++ /** Constructor */
++ CLRadixSortGlobalScanHistogram();
++ /** Prevent instances of this class from being copied (As this class contains pointers) */
++ CLRadixSortGlobalScanHistogram(const CLRadixSortGlobalScanHistogram &) = delete;
++ /** Prevent instances of this class from being copied (As this class contains pointers) */
++ CLRadixSortGlobalScanHistogram &operator=(const CLRadixSortGlobalScanHistogram &) = delete;
++ /** Allow instances of this class to be moved */
++ CLRadixSortGlobalScanHistogram(CLRadixSortGlobalScanHistogram &&) = default;
++ /** Allow instances of this class to be moved */
++ CLRadixSortGlobalScanHistogram &operator=(CLRadixSortGlobalScanHistogram &&) = default;
++
++ void configure(cl::Buffer* glob_sum_buf, cl::Buffer* temp_buf, int bits);
++
++ // Inherited methods overridden:
++ void run(const Window &window, cl::CommandQueue &queue) override;
++};
++
++class CLRadixSortPasteHistogram : public ICLKernel
++{
++public:
++ /** Constructor */
++ CLRadixSortPasteHistogram();
++ /** Prevent instances of this class from being copied (As this class contains pointers) */
++ CLRadixSortPasteHistogram(const CLRadixSortPasteHistogram &) = delete;
++ /** Prevent instances of this class from being copied (As this class contains pointers) */
++ CLRadixSortPasteHistogram &operator=(const CLRadixSortPasteHistogram &) = delete;
++ /** Allow instances of this class to be moved */
++ CLRadixSortPasteHistogram(CLRadixSortPasteHistogram &&) = default;
++ /** Allow instances of this class to be moved */
++ CLRadixSortPasteHistogram &operator=(CLRadixSortPasteHistogram &&) = default;
++
++ void configure(cl::Buffer* hist_buf, cl::Buffer* glob_sum_buf, int bits);
++
++ // Inherited methods overridden:
++ void run(const Window &window, cl::CommandQueue &queue) override;
++};
++
++class CLRadixSortReorder : public ICLKernel
++{
++public:
++ /** Constructor */
++ CLRadixSortReorder();
++ /** Prevent instances of this class from being copied (As this class contains pointers) */
++ CLRadixSortReorder(const CLRadixSortReorder &) = delete;
++ /** Prevent instances of this class from being copied (As this class contains pointers) */
++ CLRadixSortReorder &operator=(const CLRadixSortReorder &) = delete;
++ /** Allow instances of this class to be moved */
++ CLRadixSortReorder(CLRadixSortReorder &&) = default;
++ /** Allow instances of this class to be moved */
++ CLRadixSortReorder &operator=(CLRadixSortReorder &&) = default;
++
++ void configure(cl::Buffer *hist_buf, int bits, int n);
++
++ void setPass(int pass, cl::Buffer *in_key_buf, cl::Buffer *out_key_buf,
++ cl::Buffer *in_ind_buf, cl::Buffer *out_ind_buf) {
++ _pass = pass;
++ _in_key_buf = in_key_buf;
++ _out_key_buf = out_key_buf;
++ _in_ind_buf = in_ind_buf;
++ _out_ind_buf = out_ind_buf;
++ }
++ // Inherited methods overridden:
++ void run(const Window &window, cl::CommandQueue &queue) override;
++
++private:
++ int _pass;
++ cl::Buffer *_in_key_buf;
++ cl::Buffer *_out_key_buf;
++ cl::Buffer *_in_ind_buf;
++ cl::Buffer *_out_ind_buf;
++};
++
++class CLTopKV2FindFirstNegative : public ICLKernel
++{
++public:
++ /** Constructor */
++ CLTopKV2FindFirstNegative();
++ /** Prevent instances of this class from being copied (As this class contains pointers) */
++ CLTopKV2FindFirstNegative(const CLTopKV2FindFirstNegative &) = delete;
++ /** Prevent instances of this class from being copied (As this class contains pointers) */
++ CLTopKV2FindFirstNegative &operator=(const CLTopKV2FindFirstNegative &) = delete;
++ /** Allow instances of this class to be moved */
++ CLTopKV2FindFirstNegative(CLTopKV2FindFirstNegative &&) = default;
++ /** Allow instances of this class to be moved */
++ CLTopKV2FindFirstNegative &operator=(CLTopKV2FindFirstNegative &&) = default;
++
++ void configure(cl::Buffer *first_negative_idx_buf, int n);
++
++ void setOutputBuffer(cl::Buffer* out_key_buf) {
++ _out_key_buf = out_key_buf;
++ }
++
++ // Inherited methods overridden:
++ void run(const Window &window, cl::CommandQueue &queue) override;
++
++private:
++ cl::Buffer *_out_key_buf;
++};
++
++class CLTopKV2ReorderNegatives : public ICLKernel
++{
++public:
++ /** Constructor */
++ CLTopKV2ReorderNegatives();
++ /** Prevent instances of this class from being copied (As this class contains pointers) */
++ CLTopKV2ReorderNegatives(const CLTopKV2ReorderNegatives &) = delete;
++ /** Prevent instances of this class from being copied (As this class contains pointers) */
++ CLTopKV2ReorderNegatives &operator=(const CLTopKV2ReorderNegatives &) = delete;
++ /** Allow instances of this class to be moved */
++ CLTopKV2ReorderNegatives(CLTopKV2ReorderNegatives &&) = default;
++ /** Allow instances of this class to be moved */
++ CLTopKV2ReorderNegatives &operator=(CLTopKV2ReorderNegatives &&) = default;
++
++ void configure(cl::Buffer *first_negative_idx_buf, int n);
++
++ void setBuffers(cl::Buffer *in_key_buf, cl::Buffer* out_key_buf,
++ cl::Buffer *in_ind_buf, cl::Buffer *out_ind_buf) {
++ _in_key_buf = in_key_buf;
++ _out_key_buf = out_key_buf;
++ _in_ind_buf = in_ind_buf;
++ _out_ind_buf = out_ind_buf;
++ }
++
++ // Inherited methods overridden:
++ void run(const Window &window, cl::CommandQueue &queue) override;
++
++private:
++ cl::Buffer *_in_key_buf;
++ cl::Buffer *_out_key_buf;
++ cl::Buffer *_in_ind_buf;
++ cl::Buffer *_out_ind_buf;
++};
++
++class CLTopKV2Store : public ICLKernel
++{
++public:
++ /** Constructor */
++ CLTopKV2Store();
++ /** Prevent instances of this class from being copied (As this class contains pointers) */
++ CLTopKV2Store(const CLTopKV2Store &) = delete;
++ /** Prevent instances of this class from being copied (As this class contains pointers) */
++ CLTopKV2Store &operator=(const CLTopKV2Store &) = delete;
++ /** Allow instances of this class to be moved */
++ CLTopKV2Store(CLTopKV2Store &&) = default;
++ /** Allow instances of this class to be moved */
++ CLTopKV2Store &operator=(CLTopKV2Store &&) = default;
++
++ void configure(ICLTensor *values, ICLTensor *indices, int k, int n);
++
++ void setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf);
++
++ // Inherited methods overridden:
++ void run(const Window &window, cl::CommandQueue &queue) override;
++private:
++ ICLTensor *_values;
++ ICLTensor *_indices;
++ cl::Buffer *_out_key_buf;
++ cl::Buffer *_out_ind_buf;
++};
++
++} // namespace arm_compute
++
++#endif // __ARM_COMPUTE_CLTOPKV2KERNEL_H__
+diff --git a/arm_compute/core/Helpers.inl b/arm_compute/core/Helpers.inl
+index b359811..b588d08 100644
+--- a/arm_compute/core/Helpers.inl
++++ b/arm_compute/core/Helpers.inl
+@@ -300,6 +300,39 @@ inline bool set_quantization_info_if_empty(ITensorInfo &info, QuantizationInfo q
+ return false;
+ }
+
++inline ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, const TensorShape &dst_shape, InterpolationPolicy policy, BorderSize border_size, bool border_undefined)
++{
++ const auto wr = static_cast<float>(dst_shape[0]) / static_cast<float>(src_info.tensor_shape()[0]);
++ const auto hr = static_cast<float>(dst_shape[1]) / static_cast<float>(src_info.tensor_shape()[1]);
++
++ ValidRegion valid_region{ Coordinates(), dst_shape, src_info.tensor_shape().num_dimensions() };
++
++ Coordinates &anchor = valid_region.anchor;
++ TensorShape &shape = valid_region.shape;
++
++ anchor.set(0, (policy == InterpolationPolicy::BILINEAR
++ && border_undefined) ?
++ ((static_cast<int>(src_info.valid_region().anchor[0] + border_size.left + 0.5f)) * wr - 0.5f) :
++ ((static_cast<int>(src_info.valid_region().anchor[0] + 0.5f)) * wr - 0.5f));
++ anchor.set(1, (policy == InterpolationPolicy::BILINEAR
++ && border_undefined) ?
++ ((static_cast<int>(src_info.valid_region().anchor[1] + border_size.top + 0.5f)) * hr - 0.5f) :
++ ((static_cast<int>(src_info.valid_region().anchor[1] + 0.5f)) * hr - 0.5f));
++ float shape_out_x = (policy == InterpolationPolicy::BILINEAR
++ && border_undefined) ?
++ ((static_cast<int>(src_info.valid_region().anchor[0]) + static_cast<int>(src_info.valid_region().shape[0]) - 1) - 1 + 0.5f) * wr - 0.5f :
++ ((static_cast<int>(src_info.valid_region().anchor[0]) + static_cast<int>(src_info.valid_region().shape[0])) + 0.5f) * wr - 0.5f;
++ float shape_out_y = (policy == InterpolationPolicy::BILINEAR
++ && border_undefined) ?
++ ((static_cast<int>(src_info.valid_region().anchor[1]) + static_cast<int>(src_info.valid_region().shape[1]) - 1) - 1 + 0.5f) * hr - 0.5f :
++ ((static_cast<int>(src_info.valid_region().anchor[1]) + static_cast<int>(src_info.valid_region().shape[1])) + 0.5f) * hr - 0.5f;
++
++ shape.set(0, shape_out_x - anchor[0]);
++ shape.set(1, shape_out_y - anchor[1]);
++
++ return valid_region;
++}
++
+ inline Coordinates index2coords(const TensorShape &shape, int index)
+ {
+ int num_elements = shape.total_size();
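For reference, the anchor/end arithmetic in calculate_valid_region_scale reduces to mapping element centres between source and destination coordinates. A minimal standalone sketch of the non-bilinear path (plain C++, names and sizes illustrative, border_undefined == false):

#include <iostream>

// Standalone sketch of the non-bilinear path above: map the centre of the
// first valid source element and one-past-the-last source element into
// destination coordinates using the width ratio wr.
int main()
{
    const float src_w = 100.f, dst_w = 200.f;        // illustrative sizes
    const float wr    = dst_w / src_w;               // width ratio, as in the patch
    const int   src_anchor_x = 0, src_shape_x = 100; // fully valid source row

    const float anchor_x = static_cast<int>(src_anchor_x + 0.5f) * wr - 0.5f;
    const float end_x    = ((src_anchor_x + src_shape_x) + 0.5f) * wr - 0.5f;

    // 2x upscale of a fully valid 100-wide region: anchor = -0.5, shape = 201
    std::cout << "anchor=" << anchor_x << " shape=" << end_x - anchor_x << "\n";
    return 0;
}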
+diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h
+index fe90b09..8396b9f 100644
+--- a/arm_compute/runtime/CL/CLFunctions.h
++++ b/arm_compute/runtime/CL/CLFunctions.h
+@@ -1,4 +1,5 @@
+ /*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+@@ -37,6 +38,7 @@
+ #include "arm_compute/runtime/CL/functions/CLBitwiseXor.h"
+ #include "arm_compute/runtime/CL/functions/CLBox3x3.h"
+ #include "arm_compute/runtime/CL/functions/CLCannyEdge.h"
++#include "arm_compute/runtime/CL/functions/CLCast.h"
+ #include "arm_compute/runtime/CL/functions/CLChannelCombine.h"
+ #include "arm_compute/runtime/CL/functions/CLChannelExtract.h"
+ #include "arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h"
+@@ -62,6 +64,7 @@
+ #include "arm_compute/runtime/CL/functions/CLFlattenLayer.h"
+ #include "arm_compute/runtime/CL/functions/CLFloor.h"
+ #include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
++#include "arm_compute/runtime/CL/functions/CLGather.h"
+ #include "arm_compute/runtime/CL/functions/CLGEMM.h"
+ #include "arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h"
+ #include "arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h"
+@@ -94,11 +97,14 @@
+ #include "arm_compute/runtime/CL/functions/CLPermute.h"
+ #include "arm_compute/runtime/CL/functions/CLPhase.h"
+ #include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h"
++#include "arm_compute/runtime/CL/functions/CLPixelWiseDivision.h"
+ #include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
+ #include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h"
+ #include "arm_compute/runtime/CL/functions/CLRNNLayer.h"
+ #include "arm_compute/runtime/CL/functions/CLROIPoolingLayer.h"
++#include "arm_compute/runtime/CL/functions/CLReduceMax.h"
+ #include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
++#include "arm_compute/runtime/CL/functions/CLReductionMean.h"
+ #include "arm_compute/runtime/CL/functions/CLRemap.h"
+ #include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
+ #include "arm_compute/runtime/CL/functions/CLScale.h"
+@@ -107,6 +113,7 @@
+ #include "arm_compute/runtime/CL/functions/CLSobel5x5.h"
+ #include "arm_compute/runtime/CL/functions/CLSobel7x7.h"
+ #include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
++#include "arm_compute/runtime/CL/functions/CLStridedSlice.h"
+ #include "arm_compute/runtime/CL/functions/CLTableLookup.h"
+ #include "arm_compute/runtime/CL/functions/CLThreshold.h"
+ #include "arm_compute/runtime/CL/functions/CLTranspose.h"
+@@ -115,5 +122,6 @@
+ #include "arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h"
+ #include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h"
+ #include "arm_compute/runtime/CL/functions/CLWinogradInputTransform.h"
++#include "arm_compute/runtime/CL/functions/CLTopKV2.h"
+
+ #endif /* __ARM_COMPUTE_CLFUNCTIONS_H__ */
+diff --git a/arm_compute/runtime/CL/functions/CLArithmeticAddition.h b/arm_compute/runtime/CL/functions/CLArithmeticAddition.h
+index 5b2fc8c..86dc2ef 100644
+--- a/arm_compute/runtime/CL/functions/CLArithmeticAddition.h
++++ b/arm_compute/runtime/CL/functions/CLArithmeticAddition.h
+@@ -41,19 +41,19 @@ class CLArithmeticAddition : public ICLSimpleFunction
+ public:
+ /** Initialise the kernel's inputs, output and convertion policy.
+ *
+- * @param[in, out] input1 First tensor input. Data types supported: U8/QS8/QS16/S16/F16/F32.
++ * @param[in, out] input1 First tensor input. Data types supported: U8/QS8/QASYMM8/QS16/S16/F16/F32.
+ * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+- * @param[in, out] input2 Second tensor input. Data types supported: U8, QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16/F32.
++ * @param[in, out] input2 Second tensor input. Data types supported: U8, QS8 (only if @p input1 is QS8), QASYMM8 (only if @p input1 is QASYMM8), QS16 (only if @p input1 is QS16), S16/F16/F32.
+ * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+- * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16/F32.
++ * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QASYMM8 (only if both inputs are QASYMM8), QS16 (only if both inputs are QS16), S16/F16/F32.
+ * @param[in] policy Policy to use to handle overflow.
+ */
+ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticAddition
+ *
+- * @param[in] input1 First tensor input info. Data types supported: U8/QS8/QS16/S16/F16/F32.
+- * @param[in] input2 Second tensor input info. Data types supported: U8/QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16/F32.
+- * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16/F32.
++ * @param[in] input1 First tensor input info. Data types supported: U8/QS8/QASYMM8/QS16/S16/F16/F32.
++ * @param[in] input2 Second tensor input info. Data types supported: U8/QS8 (only if @p input1 is QS8), QASYMM8 (only if @p input1 is QASYMM8), QS16 (only if @p input1 is QS16), S16/F16/F32.
++ * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QASYMM8 (only if both inputs are QASYMM8), QS16 (only if both inputs are QS16), S16/F16/F32.
+ * @param[in] policy Policy to use to handle overflow.
+ *
+ * @return a status
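A hedged usage sketch of the QASYMM8 path documented above. The CLScheduler/CLTensor calls are the library's standard runtime workflow rather than anything introduced by this patch, and the shapes and quantization parameters are illustrative only:

#include "arm_compute/runtime/CL/CLFunctions.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // Illustrative shape and quantization; both inputs must be QASYMM8.
    CLTensor a, b, sum;
    const TensorShape shape(16U, 16U);
    a.allocator()->init(TensorInfo(shape, 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10)));
    b.allocator()->init(TensorInfo(shape, 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10)));
    sum.allocator()->init(TensorInfo(shape, 1, DataType::QASYMM8, QuantizationInfo(1.0f, 0)));

    CLArithmeticAddition add;
    add.configure(&a, &b, &sum, ConvertPolicy::SATURATE);

    a.allocator()->allocate();
    b.allocator()->allocate();
    sum.allocator()->allocate();

    add.run();
    CLScheduler::get().sync();
    return 0;
}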
+diff --git a/arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h b/arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h
+index 0d3f5bc..6d76c70 100644
+--- a/arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h
++++ b/arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h
+@@ -1,4 +1,5 @@
+ /*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+@@ -42,12 +43,14 @@ class CLArithmeticSubtraction : public ICLSimpleFunction
+ public:
+ /** Initialise the kernel's inputs, output and convertion policy.
+ *
+- * @param[in] input1 First tensor input. Data types supported: U8/QS8/QS16/S16/F16/F32.
+- * @param[in] input2 Second tensor input. Data types supported: U8/QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16/F32.
+- * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16/F32.
+- * @param[in] policy Policy to use to handle overflow.
++ * @param[in, out] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
++ * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
++ * @param[in, out] input2 An input tensor. Data types supported: same as @p input1.
++ * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
++ * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16/F32.
++ * @param[in] policy Policy to use to handle overflow.
+ */
+- void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy);
++ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticSubtraction
+ *
+ * @param[in] input1 First tensor input info. Data types supported: U8/QS8/QS16/S16/F16/F32.
+diff --git a/arm_compute/runtime/CL/functions/CLCast.h b/arm_compute/runtime/CL/functions/CLCast.h
+new file mode 100644
+index 0000000..49fd342
+--- /dev/null
++++ b/arm_compute/runtime/CL/functions/CLCast.h
+@@ -0,0 +1,52 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2016-2018 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#ifndef __ARM_COMPUTE_CLCAST_H__
++#define __ARM_COMPUTE_CLCAST_H__
++
++#include "arm_compute/core/Types.h"
++#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
++
++namespace arm_compute
++{
++class ICLTensor;
++
++/** Basic function to run @ref CLCastKernel
++ *
++ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32.
++ * @note The function converts the input tensor to the data type of the output tensor.
++ */
++class CLCast : public ICLSimpleFunction
++{
++public:
++ /** Initialise the kernel's input and output.
++ *
++ * @param[in, out] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
++ * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel.
++ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
++ */
++ void configure(ICLTensor *input, ICLTensor *output);
++};
++}
++#endif /* __ARM_COMPUTE_CLCAST_H__ */
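A minimal usage sketch for the new function (assumes the standard CLTensor/CLScheduler workflow; shapes are illustrative):

#include "arm_compute/runtime/CL/CLFunctions.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::S32));

    CLCast cast;
    cast.configure(&src, &dst); // converts F32 -> S32, per the output tensor's type

    src.allocator()->allocate();
    dst.allocator()->allocate();
    cast.run();
    CLScheduler::get().sync();
    return 0;
}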
+diff --git a/arm_compute/runtime/CL/functions/CLGather.h b/arm_compute/runtime/CL/functions/CLGather.h
+new file mode 100644
+index 0000000..1aae32e
+--- /dev/null
++++ b/arm_compute/runtime/CL/functions/CLGather.h
+@@ -0,0 +1,56 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2016-2018 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#ifndef __ARM_COMPUTE_CLGATHER_H__
++#define __ARM_COMPUTE_CLGATHER_H__
++
++#include "arm_compute/core/Types.h"
++#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
++
++namespace arm_compute
++{
++class ICLTensor;
++
++/** Basic function to run @ref CLGatherKernel. */
++class CLGather : public ICLSimpleFunction
++{
++public:
++ /** Initialise the kernel's inputs and output.
++ *
++ * @param[in] input1 An input tensor. Data types supported: U8/S32/F32.
++ * @param[in] input2 An indices tensor. Data types supported: S32.
++ * @param[out] output The output tensor. Data types supported: same as @p input1.
++ */
++ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output);
++ /** Static function to check if given info will lead to a valid configuration of @ref CLGather
++ *
++ * @param[in] input1 An input tensor info. Data types supported: U8/S32/F32.
++ * @param[in] input2 An indices tensor info. Data types supported: S32.
++ * @param[in] output The output tensor info. Data types supported: same as @p input1.
++ *
++ * @return a status
++ */
++ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
++};
++}
++#endif /*__ARM_COMPUTE_CLGATHER_H__ */
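A hedged usage sketch showing the validate-then-configure pattern (standard ACL runtime idioms; shapes illustrative):

#include "arm_compute/runtime/CL/CLFunctions.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor params, indices, out;
    params.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::F32));
    indices.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::S32));
    out.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::F32));

    // validate() lets callers reject unsupported configurations up front
    if(CLGather::validate(params.info(), indices.info(), out.info()).error_code() == ErrorCode::OK)
    {
        CLGather gather;
        gather.configure(&params, &indices, &out);
        params.allocator()->allocate();
        indices.allocator()->allocate();
        out.allocator()->allocate();
        gather.run();
        CLScheduler::get().sync();
    }
    return 0;
}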
+diff --git a/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h b/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h
+new file mode 100644
+index 0000000..5008159
+--- /dev/null
++++ b/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h
+@@ -0,0 +1,71 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2016-2018 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#ifndef __ARM_COMPUTE_CLPIXELWISEDIVISION_H__
++#define __ARM_COMPUTE_CLPIXELWISEDIVISION_H__
++
++#include "arm_compute/core/Types.h"
++#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
++
++namespace arm_compute
++{
++class ICLTensor;
++
++/** Basic function to run @ref CLPixelWiseDivisionKernel. */
++class CLPixelWiseDivision : public ICLSimpleFunction
++{
++public:
++ /** Initialise the kernel's inputs, output and conversion policy.
++ *
++ * @param[in, out] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
++ * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
++ * @param[in, out] input2 An input tensor. Data types supported: same as @p input1.
++ * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
++ * @param[out] output The output tensor. Data types supported: same as @p input1. Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
++ * @param[in] scale Scale to apply after the division.
++ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1.
++ * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
++ * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
++ */
++ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale = 1.f,
++ ConvertPolicy overflow_policy = ConvertPolicy::WRAP,
++ RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO);
++ /** Static function to check if given info will lead to a valid configuration of @ref CLPixelWiseDivision
++ *
++ * @param[in] input1 An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32.
++ * @param[in] input2 An input tensor info. Data types supported: same as @p input1.
++ * @param[in] output The output tensor info. Data types supported: same as @p input1. Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
++ * @param[in] scale Scale to apply after the division.
++ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1.
++ * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
++ * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
++ *
++ * @return a status
++ */
++ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output,
++ float scale = 1.f, ConvertPolicy overflow_policy = ConvertPolicy::WRAP,
++ RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO);
++};
++}
++#endif /*__ARM_COMPUTE_CLPIXELWISEDIVISION_H__ */
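A hedged usage sketch exercising the explicit scale and policy arguments (standard ACL runtime idioms; shapes illustrative):

#include "arm_compute/runtime/CL/CLFunctions.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor num, den, quot;
    for(CLTensor *t : { &num, &den, &quot })
        t->allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::F32));

    CLPixelWiseDivision div;
    // scale = 1/255 is one of the documented legal values (1/255 or 1/2^n)
    div.configure(&num, &den, &quot, 1.f / 255.f,
                  ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);

    for(CLTensor *t : { &num, &den, &quot })
        t->allocator()->allocate();
    div.run();
    CLScheduler::get().sync();
    return 0;
}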
+diff --git a/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h b/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h
+index 75b67cd..3f2ffcd 100644
+--- a/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h
++++ b/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h
+@@ -37,11 +37,11 @@ class CLPixelWiseMultiplication : public ICLSimpleFunction
+ public:
+ /** Initialise the kernel's inputs, output and convertion policy.
+ *
+- * @param[in, out] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
++ * @param[in, out] input1 An input tensor. Data types supported: U8/QS8/QASYMM8/QS16/S16/F16/F32.
+ * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[in, out] input2 An input tensor. Data types supported: same as @p input1.
+ * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+- * @param[out] output The output tensor, Data types supported: same as @p input1. Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
++ * @param[out] output The output tensor. Data types supported: same as @p input1. Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). QASYMM8 requires both inputs to be QASYMM8.
+ * @param[in] scale Scale to apply after multiplication.
+ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1.
+ * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
+@@ -51,9 +51,9 @@ public:
+ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLPixelWiseMultiplication
+ *
+- * @param[in] input1 An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32.
++ * @param[in] input1 An input tensor info. Data types supported: U8/QS8/QASYMM8/QS16/S16/F16/F32.
+ * @param[in] input2 An input tensor info. Data types supported: same as @p input1.
+- * @param[in] output The output tensor info, Data types supported: same as @p input1. Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
++ * @param[in] output The output tensor info. Data types supported: same as @p input1. Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). QASYMM8 requires both inputs to be QASYMM8.
+ * @param[in] scale Scale to apply after multiplication.
+ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1.
+ * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
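The scale constraint repeated in these docs (positive, equal to 1/255 or 1/2^n with 0 <= n <= 15) can be spot-checked with a small helper. This is a sketch of the documented rule, not library code:

#include <cstdio>

// Sketch of the documented scale rule: scale must be 1/255 or 1/2^n, 0 <= n <= 15.
bool is_valid_pixelwise_scale(float scale)
{
    if(scale == 1.f / 255.f)
        return true;
    for(int n = 0; n <= 15; ++n)
    {
        if(scale == 1.f / static_cast<float>(1 << n))
            return true;
    }
    return false;
}

int main()
{
    std::printf("%d %d %d\n",
                is_valid_pixelwise_scale(1.f / 255.f), // 1
                is_valid_pixelwise_scale(0.125f),      // 1 (1/2^3)
                is_valid_pixelwise_scale(0.3f));       // 0
    return 0;
}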
+diff --git a/arm_compute/runtime/CL/functions/CLReduceMax.h b/arm_compute/runtime/CL/functions/CLReduceMax.h
+new file mode 100644
+index 0000000..9cce054
+--- /dev/null
++++ b/arm_compute/runtime/CL/functions/CLReduceMax.h
+@@ -0,0 +1,89 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2017 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#ifndef __ARM_COMPUTE_CLREDUCE_MAX_H__
++#define __ARM_COMPUTE_CLREDUCE_MAX_H__
++
++#include "arm_compute/runtime/CL/CLArray.h"
++#include "arm_compute/runtime/IFunction.h"
++#include "arm_compute/core/Types.h"
++#include "arm_compute/core/CL/ICLKernel.h"
++
++namespace arm_compute
++{
++class ICLTensor;
++
++/** Basic function to execute a reduce-max operation. This function calls the following OpenCL kernel:
++ *
++ * -# reduce_max
++ */
++class CLReduceMax : public IFunction
++{
++public:
++ /** Constructor */
++ CLReduceMax();
++ /** Prevent instances of this class from being copied (As this class contains pointers) */
++ CLReduceMax(const CLReduceMax &) = delete;
++ /** Prevent instances of this class from being copied (As this class contains pointers) */
++ CLReduceMax &operator=(const CLReduceMax &) = delete;
++ /** Allow instances of this class to be moved */
++ CLReduceMax(CLReduceMax &&) = default;
++ /** Allow instances of this class to be moved */
++ CLReduceMax &operator=(CLReduceMax &&) = default;
++ /** Initialise the kernel's input and output.
++ *
++ * @param[in] input Input tensor. Data types supported: F32
++ * @param[in] axis Axis along which to reduce
++ * @param[out] output Output tensor holding the maxima along @p axis. Data types supported: F32
++ */
++ void configure(ICLTensor *input, int32_t axis, ICLTensor *output);
++ /** Static function to check if given info will lead to a valid configuration of @ref CLReduceMax
++ *
++ * @param[in] input Input tensor info. Data types supported: F32
++ * @param[in] axis Axis along which to reduce
++ * @param[in] output Output tensor info. Data types supported: F32
++ *
++ * @return a status
++ */
++ static Status validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output);
++
++ // Inherited methods overridden:
++ void run() override;
++
++private:
++
++ void run_on_cpu();
++
++ int32_t _axis;
++
++ ICLTensor *_input;
++ ICLTensor *_output;
++
++ std::unique_ptr<ICLKernel> _kernel;
++
++};
++}
++#endif /*__ARM_COMPUTE_CLREDUCE_MAX_H__ */
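A hedged usage sketch (standard ACL runtime idioms; the single-element output shape is an assumption for a full reduction of a 1-D input):

#include "arm_compute/runtime/CL/CLFunctions.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor in, out;
    in.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::F32));
    out.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::F32));

    CLReduceMax reduce;
    reduce.configure(&in, /*axis=*/0, &out);

    in.allocator()->allocate();
    out.allocator()->allocate();
    reduce.run();
    CLScheduler::get().sync();
    return 0;
}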
+diff --git a/arm_compute/runtime/CL/functions/CLReductionMean.h b/arm_compute/runtime/CL/functions/CLReductionMean.h
+new file mode 100644
+index 0000000..1f2a8b5
+--- /dev/null
++++ b/arm_compute/runtime/CL/functions/CLReductionMean.h
+@@ -0,0 +1,76 @@
++/*
++ * Copyright (c) 2017-2018 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#ifndef __ARM_COMPUTE_CLREDUCTIONMEAN_H__
++#define __ARM_COMPUTE_CLREDUCTIONMEAN_H__
++
++#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
++#include "arm_compute/core/CL/kernels/CLReductionMeanKernel.h"
++#include "arm_compute/core/Types.h"
++#include "arm_compute/runtime/CL/CLTensor.h"
++#include "arm_compute/runtime/IFunction.h"
++
++#include <cstdint>
++#include <memory>
++#include <vector>
++
++namespace arm_compute
++{
++class ICLTensor;
++
++/** Perform reduction operation.
++ */
++class CLReductionMean : public IFunction
++{
++public:
++ /** Default Constructor.
++ */
++ CLReductionMean();
++
++ /** Set the input and output tensors.
++ *
++ * @param[in] input Source tensor. Data types supported: F32. Data layouts supported: NCHW.
++ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
++ * @param[in] axis Axis along which to reduce. Supported reduction axes: 0, 1
++ */
++ void configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis);
++
++ /** Static function to check if given info will lead to a valid configuration of @ref CLReductionMean.
++ *
++ * @param[in] input Source tensor info. Data types supported: F32. Data layouts supported: NCHW.
++ * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p input.
++ * @param[in] axis Axis along which to reduce. Supported reduction axes: 0, 1
++ *
++ * @return a status
++ */
++ static Status validate(const ITensorInfo *input, const ITensorInfo *output, std::vector<uint32_t> axis);
++
++ // Inherited methods overridden:
++ void run() override;
++
++private:
++ CLReductionMeanKernel _reduction_mean_kernel;
++ CLFillBorderKernel _fill_border_kernel;
++};
++}
++#endif /*__ARM_COMPUTE_CLREDUCTIONMEAN_H__ */
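A hedged usage sketch reducing an NCHW F32 tensor over both supported axes (shapes illustrative; the reduced dimensions are assumed to collapse to size 1):

#include "arm_compute/runtime/CL/CLFunctions.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

#include <cstdint>
#include <vector>

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor in, out;
    in.allocator()->init(TensorInfo(TensorShape(7U, 5U, 3U), 1, DataType::F32));
    out.allocator()->init(TensorInfo(TensorShape(1U, 1U, 3U), 1, DataType::F32));

    CLReductionMean mean;
    mean.configure(&in, &out, std::vector<uint32_t>{ 0, 1 }); // reduce over W and H

    in.allocator()->allocate();
    out.allocator()->allocate();
    mean.run();
    CLScheduler::get().sync();
    return 0;
}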
+diff --git a/arm_compute/runtime/CL/functions/CLStridedSlice.h b/arm_compute/runtime/CL/functions/CLStridedSlice.h
+new file mode 100644
+index 0000000..4f765bd
+--- /dev/null
++++ b/arm_compute/runtime/CL/functions/CLStridedSlice.h
+@@ -0,0 +1,73 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2017 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#ifndef __ARM_COMPUTE_CLSTRIDEDSLICE_H__
++#define __ARM_COMPUTE_CLSTRIDEDSLICE_H__
++
++#include "arm_compute/runtime/IFunction.h"
++#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
++
++namespace arm_compute
++{
++class ICLTensor;
++
++/** Basic function to run @ref CLStridedSliceKernel */
++class CLStridedSlice : public ICLSimpleFunction
++{
++public:
++ /** Initialise the kernel's inputs and outputs
++ *
++ * @param[in] input First tensor input. Data type supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32
++ * @param[out] output Output tensor. Data type supported: Same as @p input
++ * @param[in] beginData Tensor holding the begin index for each dimension
++ * @param[in] endData Tensor holding the end index for each dimension
++ * @param[in] stridesData Tensor holding the stride for each dimension
++ * @param[in] beginMask Bit mask of dimensions whose begin index is ignored
++ * @param[in] endMask Bit mask of dimensions whose end index is ignored
++ * @param[in] shrinkAxisMask Bit mask of dimensions that are shrunk to a single element
++ */
++ void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData, ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask, int32_t shrinkAxisMask);
++};
++
++class CLStridedSliceCPU : public IFunction
++{
++public:
++ /** Initialise inputs and outputs. The slice itself is computed on the CPU when run() is called.
++ *
++ * @param[in] input First tensor input.
++ * @param[out] output Output tensor.
++ * Remaining parameters have the same meaning as in @ref CLStridedSlice::configure.
++ */
++ void configure(ICLTensor *input, ICLTensor *output, ICLTensor *beginData, ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask, int32_t shrinkAxisMask);
++
++ void run() override;
++
++private:
++ void run_on_cpu();
++
++ ICLTensor *_input;
++ ICLTensor *_output;
++ ICLTensor *_beginData;
++ ICLTensor *_endData;
++ ICLTensor *_stridesData;
++ int32_t _beginMask;
++ int32_t _endMask;
++ int32_t _shrinkAxisMask;
++};
++
++}
++#endif /*__ARM_COMPUTE_CLSTRIDEDSLICE_H__ */
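A hedged usage sketch: slicing a 1-D tensor of 10 elements down to elements [2, 8) with stride 2. The S32 data type for begin/end/strides and the output shape are assumptions consistent with the parameter names, not values stated in the patch:

#include "arm_compute/runtime/CL/CLFunctions.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor in, out, begin, end, strides;
    in.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::F32));
    out.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::F32));
    for(CLTensor *t : { &begin, &end, &strides })
        t->allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::S32));

    CLStridedSlice slice;
    slice.configure(&in, &out, &begin, &end, &strides,
                    /*beginMask=*/0, /*endMask=*/0, /*shrinkAxisMask=*/0);

    // After allocating the tensors and filling begin/end/strides with
    // {2}, {8}, {2} via map()/unmap(), the slice runs with:
    // slice.run(); CLScheduler::get().sync();
    return 0;
}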
+diff --git a/arm_compute/runtime/CL/functions/CLTopKV2.h b/arm_compute/runtime/CL/functions/CLTopKV2.h
+new file mode 100644
+index 0000000..0dd4287
+--- /dev/null
++++ b/arm_compute/runtime/CL/functions/CLTopKV2.h
+@@ -0,0 +1,115 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2017 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#ifndef __ARM_COMPUTE_CLTOPK_V2_H__
++#define __ARM_COMPUTE_CLTOPK_V2_H__
++
++#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h"
++
++#include "arm_compute/runtime/CL/CLArray.h"
++#include "arm_compute/runtime/IFunction.h"
++
++namespace arm_compute
++{
++class ICLTensor;
++
++/** Basic function to execute TopK operation. This function calls the following OpenCL kernels:
++ *
++ * -# @ref CLTopKV2Kernel
++ */
++class CLTopKV2 : public IFunction
++{
++public:
++ /** Constructor */
++ CLTopKV2();
++ /** Prevent instances of this class from being copied (As this class contains pointers) */
++ CLTopKV2(const CLTopKV2 &) = delete;
++ /** Prevent instances of this class from being copied (As this class contains pointers) */
++ CLTopKV2 &operator=(const CLTopKV2 &) = delete;
++ /** Allow instances of this class to be moved */
++ CLTopKV2(CLTopKV2 &&) = default;
++ /** Allow instances of this class to be moved */
++ CLTopKV2 &operator=(CLTopKV2 &&) = default;
++ /** Initialise the kernel's inputs and outputs.
++ *
++ * @param[in] input Input tensor. Data types supported: U8/S16/F32.
++ * @param[in] k Number of top elements to retrieve.
++ * @param[out] values Top k values. Data types supported: S32 if input type is U8/S16, F32 if input type is F32.
++ * @param[out] indices Indices related to the top k values. Data types supported: S32 if input type is U8/S16, F32 if input type is F32.
++ */
++ void configure(ICLTensor *input, int k, ICLTensor *values, ICLTensor *indices,
++ int total_bits = 32, int bits = 4);
++
++ // Inherited methods overridden:
++ void run() override;
++
++private:
++
++ void run_on_cpu();
++ void run_on_gpu();
++ void run_on_gpu_single_quicksort();
++
++ uint32_t _k;
++ uint32_t _total_bits;
++ uint32_t _bits;
++ uint32_t _radix;
++ uint32_t _hist_buf_size;
++ uint32_t _glob_sum_buf_size;
++ uint32_t _n;
++
++ ICLTensor *_input;
++ ICLTensor *_values;
++ ICLTensor *_indices;
++
++ cl::Buffer _qs_idx_buf;
++ cl::Buffer _qs_temp_buf;
++ cl::Buffer _hist_buf;
++ cl::Buffer _glob_sum_buf;
++ cl::Buffer _temp_buf;
++ cl::Buffer _first_negative_idx_buf;
++ cl::Buffer _in_key_buf;
++ cl::Buffer _out_key_buf;
++ cl::Buffer _in_ind_buf;
++ cl::Buffer _out_ind_buf;
++
++ cl::Buffer *_p_in_key_buf;
++ cl::Buffer *_p_out_key_buf;
++ cl::Buffer *_p_in_ind_buf;
++ cl::Buffer *_p_out_ind_buf;
++
++ CLTopKV2Single _qs_kernel;
++ CLTopKV2Init _init_kernel;
++ CLRadixSortHistogram _hist_kernel;
++ CLRadixSortScanHistogram _scan_hist_kernel;
++ CLRadixSortGlobalScanHistogram _glob_scan_hist_kernel;
++ CLRadixSortPasteHistogram _paste_hist_kernel;
++ CLRadixSortReorder _reorder_kernel;
++ CLTopKV2FindFirstNegative _find_first_negative_kernel;
++ CLTopKV2ReorderNegatives _reorder_negatives_kernel;
++ CLTopKV2Store _store_kernel;
++};
++}
++#endif // __ARM_COMPUTE_CLTOPK_V2_H__
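A hedged usage sketch for an F32 input (standard ACL runtime idioms; shapes illustrative). Per the doc comment above, an F32 input yields F32 indices:

#include "arm_compute/runtime/CL/CLFunctions.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    const int k = 5;
    CLTensor in, values, indices;
    in.allocator()->init(TensorInfo(TensorShape(1000U), 1, DataType::F32));
    values.allocator()->init(TensorInfo(TensorShape(5U), 1, DataType::F32));
    indices.allocator()->init(TensorInfo(TensorShape(5U), 1, DataType::F32));

    CLTopKV2 topk;
    topk.configure(&in, k, &values, &indices); // defaults: 32 total bits, 4 bits per radix pass

    in.allocator()->allocate();
    values.allocator()->allocate();
    indices.allocator()->allocate();
    topk.run();
    CLScheduler::get().sync();
    return 0;
}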
+diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
+index bdb26f8..0c9f108 100644
+--- a/src/core/CL/CLKernelLibrary.cpp
++++ b/src/core/CL/CLKernelLibrary.cpp
+@@ -1,4 +1,5 @@
+ /*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+@@ -149,14 +150,19 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
+ { "accumulate_weighted", "accumulate.cl" },
+ { "activation_layer", "activation_layer.cl" },
+ { "activation_layer_qa8", "activation_layer_qa8.cl" },
++ { "activation_layer_logistic_qa8", "activation_layer_qa8.cl" },
+ { "arithmetic_add", "arithmetic_op.cl" },
+ { "arithmetic_sub", "arithmetic_op.cl" },
++ { "arithmetic_add_qasymm8", "arithmetic_op_quantized.cl" },
+ { "batchnormalization_layer_nchw", "batchnormalization_layer.cl" },
+ { "batchnormalization_layer_nhwc", "batchnormalization_layer.cl" },
+ { "bitwise_or", "bitwise_op.cl" },
+ { "bitwise_and", "bitwise_op.cl" },
+ { "bitwise_xor", "bitwise_op.cl" },
+ { "bitwise_not", "bitwise_op.cl" },
++ { "cast", "cast.cl" },
++ { "cast_qasymm_in", "cast.cl" },
++ { "cast_qasymm_out", "cast.cl" },
+ { "channel_combine_NV", "channel_combine.cl" },
+ { "channel_combine_RGB888", "channel_combine.cl" },
+ { "channel_combine_RGBA8888", "channel_combine.cl" },
+@@ -221,6 +227,9 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
+ { "fill_image_borders_replicate", "fill_border.cl" },
+ { "finalize", "optical_flow_pyramid_lk.cl" },
+ { "floor_layer", "floor.cl" },
++ { "gather", "gather.cl" },
++ { "gather_1d", "gather.cl" },
++ { "gather_1d_out", "gather.cl" },
+ { "gaussian1x5_sub_x", "gaussian_pyramid.cl" },
+ { "gaussian5x1_sub_y", "gaussian_pyramid.cl" },
+ { "gemm_accumulate_biases", "gemm.cl" },
+@@ -313,6 +322,9 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
+ { "permute_3201", "permute.cl" },
+ { "pixelwise_mul_float", "pixelwise_mul_float.cl" },
+ { "pixelwise_mul_int", "pixelwise_mul_int.cl" },
++ { "pixelwise_mul_qasymm8", "pixelwise_mul_quantized.cl" },
++ { "pixelwise_div_float", "pixelwise_div_float.cl" },
++ { "pixelwise_div_int", "pixelwise_div_int.cl" },
+ { "pooling_layer_2", "pooling_layer.cl" },
+ { "pooling_layer_3", "pooling_layer.cl" },
+ { "pooling_layer_optimized_3", "pooling_layer.cl" },
+@@ -322,7 +334,9 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
+ { "pooling_layer_MxN_quantized_nhwc", "pooling_layer_quantized.cl" },
+ { "pooling_layer_MxN_quantized_nchw", "pooling_layer_quantized.cl" },
+ { "quantization_layer", "quantization_layer.cl" },
++ { "reduce_max", "reduce_max.cl"},
+ { "reduction_operation", "reduction_operation.cl" },
++ { "reduction_mean", "reduction_mean.cl" },
+ { "remap_nearest_neighbour", "remap.cl" },
+ { "remap_bilinear", "remap.cl" },
+ { "reshape_layer", "reshape_layer.cl" },
+@@ -350,6 +364,7 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
+ { "softmax_layer_max_shift_exp_sum_quantized_parallel", "softmax_layer_quantized.cl" },
+ { "softmax_layer_max_shift_exp_sum_serial", "softmax_layer.cl" },
+ { "softmax_layer_max_shift_exp_sum_parallel", "softmax_layer.cl" },
++ { "strided_slice", "strided_slice.cl" },
+ { "suppress_non_maximum", "canny.cl" },
+ { "tablelookup_U8", "tablelookup.cl" },
+ { "tablelookup_S16", "tablelookup.cl" },
+@@ -378,6 +393,15 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
+ { "YUYV422_to_NV12_bt709", "color_convert.cl" },
+ { "YUYV422_to_RGB888_bt709", "color_convert.cl" },
+ { "YUYV422_to_RGBA8888_bt709", "color_convert.cl" },
++ { "topkv2_init", "topkv2.cl" },
++ { "topkv2_find_first_negative", "topkv2.cl" },
++ { "topkv2_reorder_negatives", "topkv2.cl" },
++ { "topkv2_store", "topkv2.cl" },
++ { "radixsort_histogram", "topkv2_radixsort.cl" },
++ { "radixsort_scanhistograms", "topkv2_radixsort.cl" },
++ { "radixsort_pastehistograms", "topkv2_radixsort.cl" },
++ { "radixsort_reorder", "topkv2_radixsort.cl" },
++ { "topkv2_quicksort", "topkv2_quicksort.cl" },
+ };
+
+ const std::map<std::string, std::string> CLKernelLibrary::_program_source_map =
+@@ -404,6 +428,10 @@ const std::map<std::string, std::string> CLKernelLibrary::_program_source_map =
+ #include "./cl_kernels/arithmetic_op.clembed"
+ },
+ {
++ "arithmetic_op_quantized.cl",
++#include "./cl_kernels/arithmetic_op_quantized.clembed"
++ },
++ {
+ "bitwise_op.cl",
+ #include "./cl_kernels/bitwise_op.clembed"
+ },
+@@ -412,6 +440,10 @@ const std::map<std::string, std::string> CLKernelLibrary::_program_source_map =
+ #include "./cl_kernels/canny.clembed"
+ },
+ {
++ "cast.cl",
++#include "./cl_kernels/cast.clembed"
++ },
++ {
+ "channel_combine.cl",
+ #include "./cl_kernels/channel_combine.clembed"
+ },
+@@ -532,6 +564,10 @@ const std::map<std::string, std::string> CLKernelLibrary::_program_source_map =
+ #include "./cl_kernels/floor.clembed"
+ },
+ {
++ "gather.cl",
++#include "./cl_kernels/gather.clembed"
++ },
++ {
+ "gaussian_pyramid.cl",
+ #include "./cl_kernels/gaussian_pyramid.clembed"
+ },
+@@ -636,6 +672,18 @@ const std::map<std::string, std::string> CLKernelLibrary::_program_source_map =
+ #include "./cl_kernels/pixelwise_mul_int.clembed"
+ },
+ {
++ "pixelwise_mul_quantized.cl",
++#include "./cl_kernels/pixelwise_mul_quantized.clembed"
++ },
++ {
++ "pixelwise_div_float.cl",
++#include "./cl_kernels/pixelwise_div_float.clembed"
++ },
++ {
++ "pixelwise_div_int.cl",
++#include "./cl_kernels/pixelwise_div_int.clembed"
++ },
++ {
+ "pooling_layer.cl",
+ #include "./cl_kernels/pooling_layer.clembed"
+ },
+@@ -648,10 +696,18 @@ const std::map<std::string, std::string> CLKernelLibrary::_program_source_map =
+ #include "./cl_kernels/quantization_layer.clembed"
+ },
+ {
++ "reduce_max.cl",
++#include "./cl_kernels/reduce_max.clembed"
++ },
++ {
+ "reduction_operation.cl",
+ #include "./cl_kernels/reduction_operation.clembed"
+ },
+ {
++ "reduction_mean.cl",
++#include "./cl_kernels/reduction_mean.clembed"
++ },
++ {
+ "remap.cl",
+ #include "./cl_kernels/remap.clembed"
+ },
+@@ -684,6 +740,10 @@ const std::map<std::string, std::string> CLKernelLibrary::_program_source_map =
+ #include "./cl_kernels/softmax_layer_quantized.clembed"
+ },
+ {
++ "strided_slice.cl",
++#include "./cl_kernels/strided_slice.clembed"
++ },
++ {
+ "tablelookup.cl",
+ #include "./cl_kernels/tablelookup.clembed"
+ },
+@@ -715,6 +775,18 @@ const std::map<std::string, std::string> CLKernelLibrary::_program_source_map =
+ "winograd.cl",
+ #include "./cl_kernels/winograd.clembed"
+ },
++ {
++ "topkv2.cl",
++#include "./cl_kernels/topkv2.clembed"
++ },
++ {
++ "topkv2_radixsort.cl",
++#include "./cl_kernels/topkv2_radixsort.clembed"
++ },
++ {
++ "topkv2_quicksort.cl",
++#include "./cl_kernels/topkv2_quicksort.clembed"
++ },
+ #endif /* EMBEDDED_KERNELS */
+ };
+
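The registrations above form a two-level indirection: kernel name maps to a program file name, which in turn maps to the embedded source. A standalone sketch of that lookup (plain std::map; the embedded strings are placeholders for the .clembed contents):

#include <iostream>
#include <map>
#include <string>

int main()
{
    // Mirrors _kernel_program_map / _program_source_map from the patch.
    const std::map<std::string, std::string> kernel_to_program = {
        { "topkv2_init", "topkv2.cl" },
        { "radixsort_reorder", "topkv2_radixsort.cl" },
    };
    const std::map<std::string, std::string> program_to_source = {
        { "topkv2.cl", "/* embedded topkv2.clembed contents */" },
        { "topkv2_radixsort.cl", "/* embedded topkv2_radixsort.clembed contents */" },
    };

    const std::string kernel = "topkv2_init";
    const std::string prog   = kernel_to_program.at(kernel); // first hop
    const std::string &src   = program_to_source.at(prog);   // second hop
    std::cout << kernel << " -> " << prog << " -> " << src.size() << " bytes\n";
    return 0;
}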
+diff --git a/src/core/CL/cl_kernels/activation_layer_qa8.cl b/src/core/CL/cl_kernels/activation_layer_qa8.cl
+index 66e54ed..5540932 100644
+--- a/src/core/CL/cl_kernels/activation_layer_qa8.cl
++++ b/src/core/CL/cl_kernels/activation_layer_qa8.cl
+@@ -21,10 +21,17 @@
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+-#include "helpers.h"
++#include "helpers_asymm.h"
+
+ #define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
++// Logistic Activation
++inline TYPE logistic_op(TYPE x)
++{
++ // Placeholder identity: the logistic computation is implemented by the dedicated
++ // activation_layer_logistic_qa8 kernel below; this stub only keeps the ACT dispatch uniform.
++ return x;
++}
+ // RELU Activation
+ inline TYPE relu_op(TYPE x)
+ {
+@@ -119,4 +126,100 @@ __kernel void activation_layer_qa8(
+ (data, 0, (__global DATA_TYPE *)output.ptr);
+ }
+
+-#endif /* defined(ACT) */
+\ No newline at end of file
++#endif /* defined(ACT) */
++
++/** This performs a logistic activation function on QASYMM8 inputs.
++ *
++ * @note In order to perform the logistic activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
++ *
++ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
++ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
++ * @note Quantization scales of the input/output tensors are passed in with -DS1_VAL= and -DS2_VAL= respectively.
++ * @note Quantization offsets of the input/output tensors are passed in with -DO1_VAL= and -DO2_VAL= respectively.
++ * @note Quantized value of constant zero should be given as a preprocessor argument using -DCONST_0=value. e.g. -DCONST_0=128.
++ * @note The input quantization multiplier and left shift can optionally be passed at compile time using -DINPUT_MULTIPLIER and -DINPUT_LEFT_SHIFT (if undefined, the input is used as-is and not rescaled separately).
++ * @note Number of integer bits should be given as a preprocessor argument using -DINPUT_INTEGER_BITS=value. e.g. -DINPUT_INTEGER_BITS=4.
++ * @note The input range radius should be given at compile time using -DINPUT_RANGE_RADIUS.
++ *
++ * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8
++ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
++ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
++ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
++ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
++ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
++ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
++ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
++ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
++ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
++ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
++ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
++ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
++ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
++ */
++__kernel void activation_layer_logistic_qa8(
++ TENSOR3D_DECLARATION(input)
++#ifndef IN_PLACE
++ ,
++ TENSOR3D_DECLARATION(output)
++#endif /* not IN_PLACE */
++)
++{
++ // Get pixels pointer
++ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
++#ifdef IN_PLACE
++ Tensor3D output = input;
++#else /* IN_PLACE */
++ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
++#endif /* IN_PLACE */
++
++ // Load data
++ VEC_DATA_TYPE(int, 16)
++ data = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), VEC_DATA_TYPE(int, 16));
++
++ VEC_DATA_TYPE(int, 16)
++ result = data;
++
++#if defined(INPUT_INTEGER_BITS) && defined(INPUT_RANGE_RADIUS)
++ const VEC_DATA_TYPE(int, 16) Q0_one = INT_MAX;
++ const VEC_DATA_TYPE(int, 16) Q0_one_half = (1 << 30);
++
++ VEC_DATA_TYPE(int, 16)
++ input_val_centered = data;
++#ifdef O1_VAL
++ input_val_centered = data - O1_VAL;
++#endif /* O1_VAL */
++
++ VEC_DATA_TYPE(int, 16) result_left = ASYMM_SELECT_USING_MASK(input_val_centered <= -INPUT_RANGE_RADIUS, 1, 0, 16);
++ VEC_DATA_TYPE(int, 16) result_right = ASYMM_SELECT_USING_MASK(input_val_centered >= INPUT_RANGE_RADIUS, 255, 0, 16);
++
++ VEC_DATA_TYPE(int, 16) input_mask = ASYMM_SELECT_USING_MASK(input_val_centered > -INPUT_RANGE_RADIUS && input_val_centered < INPUT_RANGE_RADIUS, 1, 0, 16);
++ VEC_DATA_TYPE(int, 16) input_val_rescaled = input_val_centered * input_mask;
++#if defined(INPUT_MULTIPLIER) && defined(INPUT_LEFT_SHIFT)
++ if(INPUT_MULTIPLIER > 1)
++ {
++ input_val_rescaled = ASYMM_MULT(input_val_rescaled * (1 << INPUT_LEFT_SHIFT), INPUT_MULTIPLIER, 16);
++ }
++#endif /* defined(INPUT_MULTIPLIER) && defined(INPUT_LEFT_SHIFT) */
++
++ VEC_DATA_TYPE(int, 16) mask_if_positive = ASYMM_MASK_IF_NON_ZERO(input_val_rescaled > CONST_0, 16);
++ VEC_DATA_TYPE(int, 16) mask_if_zero = ASYMM_MASK_IF_NON_ZERO(!input_val_rescaled, 16);
++ VEC_DATA_TYPE(int, 16) abs_input = ASYMM_SELECT_USING_MASK(mask_if_positive, input_val_rescaled, -input_val_rescaled, 16);
++ VEC_DATA_TYPE(int, 16) result_exp = ASYMM_EXP_ON_NEGATIVE_VALUES(-abs_input, INPUT_INTEGER_BITS, 16);
++ VEC_DATA_TYPE(int, 16) result_if_positive = ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(result_exp, 16);
++ VEC_DATA_TYPE(int, 16) result_if_negative = Q0_one - result_if_positive;
++ VEC_DATA_TYPE(int, 16) result_logistic = ASYMM_SELECT_USING_MASK(mask_if_zero, Q0_one_half, ASYMM_SELECT_USING_MASK(mask_if_positive, result_if_positive, result_if_negative, 16), 16);
++
++ result_logistic = ASYMM_ROUNDING_DIVIDE_BY_POW2(result_logistic, 23, 16);
++ result_logistic = ASYMM_SELECT_USING_MASK(result_logistic == 256, 255, result_logistic, 16);
++ result_logistic = result_logistic * input_mask;
++
++ result = result_left + result_right + result_logistic;
++#endif /* defined(INPUT_INTEGER_BITS) && defined(INPUT_RANGE_RADIUS) */
++
++ // Store result
++ TYPE tmp = CONVERT(result, TYPE);
++ VSTORE(VEC_SIZE)
++ (tmp, 0, (__global DATA_TYPE *)output.ptr);
++}
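As a cross-check for the fixed-point kernel above, a plain float reference of the mapping it approximates: dequantize with the input scale/offset, apply the logistic, requantize to [0, 255]. The output quantization (scale 1/256, zero offset) is an assumption typical for QASYMM8 logistic outputs, not something stated in the patch:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Float reference for the QASYMM8 logistic: dequantize -> sigmoid -> requantize.
// The kernel above computes the same mapping in fixed point.
uint8_t logistic_qasymm8_ref(uint8_t in, float in_scale, int in_offset)
{
    const float x = (static_cast<int>(in) - in_offset) * in_scale; // dequantize
    const float y = 1.f / (1.f + std::exp(-x));                    // logistic in [0, 1]
    // Requantize assuming output scale 1/256 and offset 0, matching the
    // kernel's clamp of the result into [0..255].
    const int q = static_cast<int>(std::lround(y * 256.f));
    return static_cast<uint8_t>(std::min(std::max(q, 0), 255));
}

int main()
{
    std::printf("%u\n", logistic_qasymm8_ref(128, 0.1f, 128)); // x = 0 -> 128
    return 0;
}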
+diff --git a/src/core/CL/cl_kernels/arithmetic_op_quantized.cl b/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
+new file mode 100644
+index 0000000..0c0a9ed
+--- /dev/null
++++ b/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
+@@ -0,0 +1,138 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2016, 2017 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#include "helpers_asymm.h"
++
++#if defined(FIXED_POINT_POSITION)
++#include "fixed_point.h"
++#endif /* FIXED_POINT_POSITION */
++
++#ifdef SATURATE
++#define ADD(x, y) add_sat((x), (y))
++#define SUB(x, y) sub_sat((x), (y))
++#else /* SATURATE */
++#define ADD(x, y) (x) + (y)
++#define SUB(x, y) (x) - (y)
++#endif /* SATURATE */
++
++/** Performs a pixelwise addition used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
++ *
++ * The following computations will be performed:
++ *
++ * -# Add offset terms to inputs
++ * -# Get scaled value of two inputs
++ * -# Add inputs
++ * -# Add offset terms to final result
++ * -# Multiply each entry of result by result_mult_int
++ * -# Shift the int32 accumulator by result_shift
++ * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
++ *
++ * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
++ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
++ * @attention The number of bits to shift left of input tensors must be passed at compile time using -DLEFT_SHIFT
++ * @attention The offset, scalar scale factor and number of bits to shift right of input tensors must be passed at compile time using -DIN1_OFFSET, -DIN1_MULT_INT, -DIN1_SHIFT, -DIN2_OFFSET, -DIN2_MULT_INT and -DIN2_SHIFT
++ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULT_INT and -DRESULT_SHIFT
++ *
++ * @attention The inputs and output scale information of qasymm8 need to be passed at compile time using -DSCALE_IN1, -DSCALE_IN2 and -DSCALE_OUT:
++ * e.g. -DSCALE_IN1=1.f -DSCALE_IN2=1.f -DSCALE_OUT=2.f
++ * @attention The inputs and output scale offset need to be passed at compile time using -DOFFSET_IN1, -DOFFSET_IN2 and -DOFFSET_OUT:
++ * e.g. -DOFFSET_IN1=0 -DOFFSET_IN2=0 -DOFFSET_OUT=0
++ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
++ * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
++ *
++ * @param[in] in1_ptr Pointer to the source tensor. Supported data types: QASYMM8
++ * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
++ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes)
++ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
++ * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes)
++ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
++ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor
++ * @param[in] in2_ptr Pointer to the source tensor. Supported data types: QASYMM8
++ * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes)
++ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes)
++ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
++ * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes)
++ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
++ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor
++ * @param[out] out_ptr Pointer to the destination tensor. Supported data types: QASYMM8
++ * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
++ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
++ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
++ * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes)
++ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
++ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
++ */
++__kernel void arithmetic_add_qasymm8(
++ TENSOR3D_DECLARATION(in1),
++ TENSOR3D_DECLARATION(in2),
++ TENSOR3D_DECLARATION(out))
++{
++ // Get pixels pointer
++ Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
++ Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
++ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
++
++ // Load data
++ VEC_DATA_TYPE(int, 16)
++ in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16));
++ VEC_DATA_TYPE(int, 16)
++ in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16));
++
++ // Get scaled value of two inputs
++ VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET);
++ VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET);
++
++ VEC_DATA_TYPE(int, 16) left_shift = (VEC_DATA_TYPE(int, 16))1 << (VEC_DATA_TYPE(int, 16))(LEFT_SHIFT);
++ VEC_DATA_TYPE(int, 16) shifted_in1_val = in1_val * left_shift;
++ VEC_DATA_TYPE(int, 16) shifted_in2_val = in2_val * left_shift;
++
++ VEC_DATA_TYPE(int, 16) scaled_in1_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in1_val, IN1_MULT_INT, IN1_SHIFT, 16);
++ VEC_DATA_TYPE(int, 16) scaled_in2_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in2_val, IN2_MULT_INT, IN2_SHIFT, 16);
++
++ // Add inputs and multiply with a multiplier smaller than 1
++ VEC_DATA_TYPE(int, 16) sum_val = scaled_in1_val + scaled_in2_val;
++ VEC_DATA_TYPE(int, 16) out_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(sum_val, RESULT_MULT_INT, RESULT_SHIFT, 16);
++ out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET);
++
++ VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16));
++
++// TODO: Apply min-max BOUND to support fusing with ReLU.
++/*
++#if defined(MIN_BOUND)
++ res = max(res, (uchar16)MIN_BOUND);
++#endif // defined(MIN_BOUND)
++#if defined(MAX_BOUND)
++ res = min(res, (uchar16)MAX_BOUND);
++#endif // defined(MAX_BOUND)
++*/
++
++ // Store result
++ VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)),
++ 0, (__global DATA_TYPE_OUT *)out.ptr);
++}
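++
++/* Sketch of the math above (inferred from the defines, not an ACL contract):
++ * both inputs are brought to a common scale with LEFT_SHIFT, scaled by their
++ * fixed-point multipliers (IN1_MULT_INT >> IN1_SHIFT, likewise for IN2),
++ * summed, then requantized with RESULT_MULT_INT/RESULT_SHIFT and offset by
++ * RESULT_OFFSET. A hypothetical build option set:
++ *   -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
++ *   -DIN1_OFFSET=-128 -DIN2_OFFSET=-128 -DRESULT_OFFSET=128 -DLEFT_SHIFT=20
++ *   -DIN1_MULT_INT=1073741824 -DIN1_SHIFT=0 -DIN2_MULT_INT=1073741824
++ *   -DIN2_SHIFT=0 -DRESULT_MULT_INT=1073741824 -DRESULT_SHIFT=19
++ */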
+diff --git a/src/core/CL/cl_kernels/cast.cl b/src/core/CL/cl_kernels/cast.cl
+new file mode 100644
+index 0000000..113804c
+--- /dev/null
++++ b/src/core/CL/cl_kernels/cast.cl
+@@ -0,0 +1,148 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2017 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#include "helpers.h"
++
++#ifndef SCALE_IN
++#define SCALE_IN 1.0f
++#endif
++#ifndef OFFSET_IN
++#define OFFSET_IN 0
++#endif
++
++/** Perform a cast operation on an input tensor.
++ *
++ * @attention The input and output data types should be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flags, e.g. -DDATA_TYPE_IN=float -DDATA_TYPE_OUT=half
++ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
++ *
++ * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
++ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
++ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
++ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
++ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
++ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
++ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
++ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
++ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
++ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
++ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
++ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
++ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
++ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
++ */
++__kernel void cast(
++ TENSOR3D_DECLARATION(input),
++ TENSOR3D_DECLARATION(output))
++{
++ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
++ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
++
++ VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr),
++ VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),
++ 0, (__global DATA_TYPE_OUT *)output.ptr);
++}
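++
++/* Hypothetical build options for the generic cast above, assuming an F32 to
++ * F16 conversion: -DDATA_TYPE_IN=float -DDATA_TYPE_OUT=half -DVEC_SIZE=16.
++ * Each work item converts VEC_SIZE elements with a single VLOAD/VSTORE pair.
++ */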
++
++
++/** Perform a cast operation on a QASYMM8 input tensor.
++ *
++ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
++ *
++ * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8
++ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
++ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
++ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
++ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
++ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
++ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
++ * @param[out] output_ptr Pointer to the destination image. Supported data types: F16/F32
++ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
++ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
++ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
++ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
++ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
++ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
++ */
++__kernel void cast_qasymm_in(
++ TENSOR3D_DECLARATION(input),
++ TENSOR3D_DECLARATION(output))
++{
++ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
++ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
++
++ VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_data =
++ VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
++ VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET_IN);
++ VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE_IN);
++
++ VEC_DATA_TYPE(int, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(int, VEC_SIZE)) - offset;
++ VEC_DATA_TYPE(float, VEC_SIZE) out_data = CONVERT(tmp, VEC_DATA_TYPE(float, VEC_SIZE)) * scale;
++
++ VSTORE(VEC_SIZE)(CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),
++ 0, (__global DATA_TYPE_OUT *)output.ptr);
++}
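++
++/* cast_qasymm_in dequantizes: out = (in - OFFSET_IN) * SCALE_IN. A hypothetical
++ * build for a QASYMM8 input with scale 0.5 and zero point 3 would pass
++ * -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=float -DVEC_SIZE=16 -DOFFSET_IN=3
++ * -DSCALE_IN=0.5f; the defaults SCALE_IN=1.0f and OFFSET_IN=0 apply otherwise.
++ */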
++
++
++/** Perform a cast operation on a QASYMM8 output tensor.
++ *
++ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
++ *
++ * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
++ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
++ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
++ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
++ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
++ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
++ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
++ * @param[out] output_ptr Pointer to the destination image. Supported data types: U8
++ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
++ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
++ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
++ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
++ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
++ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
++ */
++__kernel void cast_qasymm_out(
++ TENSOR3D_DECLARATION(input),
++ TENSOR3D_DECLARATION(output))
++{
++ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
++ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
++
++ VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_data =
++ VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
++ VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET_IN);
++ VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE_IN);
++
++ VEC_DATA_TYPE(float, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(float, VEC_SIZE)) / scale;
++ VEC_DATA_TYPE(float, VEC_SIZE) out_data = tmp + CONVERT(offset, VEC_DATA_TYPE(float, VEC_SIZE));
++
++ VSTORE(VEC_SIZE)(CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),
++ 0, (__global DATA_TYPE_OUT *)output.ptr);
++}
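++
++/* cast_qasymm_out is the inverse mapping: out = in / SCALE_IN + OFFSET_IN,
++ * quantizing a float tensor back to QASYMM8 with the same (assumed)
++ * -DSCALE_IN/-DOFFSET_IN options and -DDATA_TYPE_OUT=uchar.
++ */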
+diff --git a/src/core/CL/cl_kernels/fixed_point.h b/src/core/CL/cl_kernels/fixed_point.h
+index 46fa645..e2f376b 100644
+--- a/src/core/CL/cl_kernels/fixed_point.h
++++ b/src/core/CL/cl_kernels/fixed_point.h
+@@ -1,4 +1,5 @@
+ /*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+@@ -298,6 +299,29 @@ MLALQ_SAT_IMPL(qs16x8, qs32x8)
+ #define MLAL_SAT_OP_EXPAND_STR(a, b, c, type, size, position) mlal_sat_##type##x##size((a), (b), (c), (position))
+ #define MLAL_SAT_OP_EXPAND(a, b, c, type, size, position) MLAL_SAT_OP_EXPAND_STR(a, b, c, type, size, position)
+
++/** Division of two fixed point numbers
++ *
++ * @param[in] type the actual data type.
++ * @param[in] itype the intermediate data type.
++ *
++ * @return The result of the fixed point division.
++ */
++#define DIVQ_IMPL(type, itype)                                                                          \
++    inline type div_##type(type VopA, type VopB, int fixed_point_position)                              \
++    {                                                                                                   \
++        itype res = (CONVERT((VopA), itype) << (itype)fixed_point_position) / CONVERT((VopB), itype);   \
++        return CONVERT(res, type);                                                                      \
++    }
++
++DIVQ_IMPL(qs8x8, qs16x8)
++DIVQ_IMPL(qs16x8, qs32x8)
++DIVQ_IMPL(qs8x16, qs16x16)
++DIVQ_IMPL(qs16x16, qs32x16)
++
++#define DIV_OP_EXPAND_STR(a, b, type, size, position) div_##type##x##size((a), (b), (position))
++#define DIV_OP_EXPAND(a, b, type, size, position) DIV_OP_EXPAND_STR(a, b, type, size, position)
++
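++/* Example (hypothetical) expansion for qs8 data with 8 elements and
++ * FIXED_POINT_POSITION=3:
++ *   DIV_OP_EXPAND(a, b, qs8, 8, 3) -> div_qs8x8(a, b, 3)
++ * which widens to qs16x8, computes (a << 3) / b and narrows back to qs8x8.
++ */
++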
+ /** Saturate division of two fixed point vectors
+ *
+ * @param[in] stype the actual scalar data type.
+diff --git a/src/core/CL/cl_kernels/gather.cl b/src/core/CL/cl_kernels/gather.cl
+new file mode 100644
+index 0000000..25e20f5
+--- /dev/null
++++ b/src/core/CL/cl_kernels/gather.cl
+@@ -0,0 +1,106 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2017 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#include "helpers.h"
++
++/** Perform gather
++ *
++ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
++ *
++ * @param[in] input1_ptr Pointer to the first source tensor. Supported data types: U8/S32/F32
++ * @param[in] input1_stride_x Stride of the first source tensor in X dimension (in bytes)
++ * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] input1_stride_y Stride of the first source tensor in Y dimension (in bytes)
++ * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes)
++ * @param[in] input1_stride_z Stride of the first source tensor in Z dimension (in bytes)
++ * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes)
++ * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the first source tensor
++ * @param[in] input2_ptr Pointer to the second source tensor. Supported data types: U32
++ * @param[in] input2_stride_x Stride of the second source tensor in X dimension (in bytes)
++ * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the second source tensor
++ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
++ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
++ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
++ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
++ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
++ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
++ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
++ */
++__kernel void gather(IMAGE_DECLARATION(input1),
++ VECTOR_DECLARATION(input2),
++ IMAGE_DECLARATION(output))
++{
++ Image in1 = CONVERT_TO_IMAGE_STRUCT_NO_STEP(input1);
++ Vector in2 = CONVERT_TO_VECTOR_STRUCT(input2);
++ Image out = CONVERT_TO_IMAGE_STRUCT_NO_STEP(output);
++
++ VEC_DATA_TYPE(DATA_TYPE_IN2, 2)
++ in2_data = CONVERT(vload2(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_IN2, 2));
++
++    // TODO: performance tuning for memcopy
++    int index  = in2_data.s0;
++    int stride = input1_stride_y / input1_stride_x;
++
++    for(int i = 0; i < stride; i++)
++    {
++        *((__global DATA_TYPE_OUT *)offset(&out, i, get_global_id(0))) = *((__global DATA_TYPE_IN1 *)offset(&in1, i, index));
++    }
++}
++
++__kernel void gather_1d_out(IMAGE_DECLARATION(input1),
++ VECTOR_DECLARATION(input2),
++ VECTOR_DECLARATION(output))
++{
++ Image in1 = CONVERT_TO_IMAGE_STRUCT_NO_STEP(input1);
++ Vector in2 = CONVERT_TO_VECTOR_STRUCT(input2);
++ Vector out = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output);
++
++ VEC_DATA_TYPE(DATA_TYPE_IN2, 2)
++ in2_data = CONVERT(vload2(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_IN2, 2));
++
++    // TODO: performance tuning for memcopy
++    int index  = in2_data.s0;
++    int stride = input1_stride_y / input1_stride_x;
++
++    for(int i = 0; i < stride; i++)
++    {
++        *((__global DATA_TYPE_OUT *)vector_offset(&out, i + get_global_id(0))) = *((__global DATA_TYPE_IN1 *)offset(&in1, i, index));
++    }
++}
++
++__kernel void gather_1d(VECTOR_DECLARATION(input1),
++ VECTOR_DECLARATION(input2),
++ VECTOR_DECLARATION(output))
++{
++ Vector in1 = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input1);
++ Vector in2 = CONVERT_TO_VECTOR_STRUCT(input2);
++ Vector out = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output);
++
++ VEC_DATA_TYPE(DATA_TYPE_IN2, 2)
++ in2_data = CONVERT(vload2(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_IN2, 2));
++
++    // TODO: performance tuning for memcopy
++    int index = in2_data.s0;
++    *((__global DATA_TYPE_OUT *)vector_offset(&out, get_global_id(0))) = *((__global DATA_TYPE_IN1 *)vector_offset(&in1, index));
++}
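++
++/* Usage note (assumption, not stated in this patch): in2 holds a single
++ * S32/U32 index in component .s0; `gather` copies row `index` of a 2D input
++ * (one work item per output row), while gather_1d_out and gather_1d are the
++ * flattened-output and fully 1D variants of the same element copy.
++ */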
+diff --git a/src/core/CL/cl_kernels/pixelwise_div_float.cl b/src/core/CL/cl_kernels/pixelwise_div_float.cl
+new file mode 100644
+index 0000000..512c620
+--- /dev/null
++++ b/src/core/CL/cl_kernels/pixelwise_div_float.cl
+@@ -0,0 +1,96 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2016, 2017 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#include "helpers.h"
++
++#ifdef SATURATE
++#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x))
++#else /* SATURATE */
++#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x))
++#endif /* SATURATE */
++#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round)
++
++/** Performs a pixelwise division with float scale of either integer or float inputs.
++ *
++ * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
++ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short
++ * @attention The data type of the intermediate result of the division should be passed as well using -DDATA_TYPE_RES.
++ * e.g. If one of the inputs is S16 -DDATA_TYPE_RES=int should be passed, else -DDATA_TYPE_RES=short.
++ * @attention -DDATA_TYPE_FLOAT must be passed if floating point inputs are provided.
++ *
++ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8, S16, F16, F32
++ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
++ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
++ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
++ * @param[in] in1_stride_z Stride of the source image in Z dimension (in bytes)
++ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
++ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
++ * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16, F16, F32
++ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
++ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
++ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
++ * @param[in] in2_stride_z Stride of the source image in Z dimension (in bytes)
++ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
++ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
++ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16, F16, F32
++ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
++ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
++ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
++ * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes)
++ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
++ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
++ * @param[in] scale Float scaling factor. Supported data types: F32
++ */
++__kernel void pixelwise_div_float(
++ TENSOR3D_DECLARATION(in1),
++ TENSOR3D_DECLARATION(in2),
++ TENSOR3D_DECLARATION(out),
++ const float scale)
++{
++ // Get pixels pointer
++ Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
++ Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
++ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
++
++ // Load data
++ VEC_DATA_TYPE(DATA_TYPE_RES, 16)
++ in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
++ VEC_DATA_TYPE(DATA_TYPE_RES, 16)
++ in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
++
++ // Perform division
++#ifdef DATA_TYPE_FLOAT
++ VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
++ res = CONVERT(in1_data / in2_data * (DATA_TYPE_RES)scale, VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
++#else /* DATA_TYPE_FLOAT */
++ VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
++ res = CONVERT_OP_FLOAT(CONVERT_OP_FLOAT((convert_float16(in1_data / in2_data) * scale), VEC_DATA_TYPE(DATA_TYPE_RES, 16), ROUND), VEC_DATA_TYPE(DATA_TYPE_OUT, 16), ROUND);
++#endif /* DATA_TYPE_FLOAT */
++
++ // Store result
++ vstore16(res, 0, (__global DATA_TYPE_OUT *)out.ptr);
++}
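++
++/* Hypothetical build options for the all-float path (DATA_TYPE_FLOAT defined):
++ *   -DDATA_TYPE_IN1=float -DDATA_TYPE_IN2=float -DDATA_TYPE_OUT=float
++ *   -DDATA_TYPE_RES=float -DDATA_TYPE_FLOAT
++ * For integer inputs, ROUND is assumed to be a convert_* rounding-mode suffix
++ * (e.g. -DROUND=_rte) and -DSATURATE selects the saturating conversion.
++ */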
+diff --git a/src/core/CL/cl_kernels/pixelwise_div_int.cl b/src/core/CL/cl_kernels/pixelwise_div_int.cl
+new file mode 100644
+index 0000000..82edf3b
+--- /dev/null
++++ b/src/core/CL/cl_kernels/pixelwise_div_int.cl
+@@ -0,0 +1,103 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2016, 2017 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#include "helpers.h"
++
++#if defined(FIXED_POINT_POSITION)
++
++#include "fixed_point.h"
++
++#if defined(SATURATE)
++#define DIV_OP(x, y, scale, type, size) DIV_SAT_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION)
++#else // SATURATE
++#define DIV_OP(x, y, scale, type, size) DIV_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION)
++#endif // SATURATE
++
++#else // FIXED_POINT_POSITION
++
++#if defined(SATURATE)
++#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size##_sat(x))
++#else // SATURATE
++#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size(x))
++#endif // SATURATE
++#define CONVERT_OP_INT(x, type, size) CONVERT_OP_INT_STR(x, type, size)
++
++#define DIV_OP(x, y, scale, type, size) CONVERT_OP_INT((x) / (y) >> scale, type, size)
++
++#endif // FIXED_POINT_POSITION
++
++/** Performs a pixelwise division with integer scale of integer inputs.
++ *
++ * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
++ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short
++ * @attention The data type of the intermediate result of the division should be passed as well using -DDATA_TYPE_RES.
++ * e.g. If one of the inputs is S16 -DDATA_TYPE_RES=int should be passed, else -DDATA_TYPE_RES=short.
++ * @note In case of fixed-point operation -DFIXED_POINT_POSITION=fixed_point_position must be provided: e.g. -DFIXED_POINT_POSITION=3
++ *
++ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8/QS8/QS16/S16
++ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
++ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
++ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
++ * @param[in] in1_stride_z Stride of the source image in Z dimension (in bytes)
++ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
++ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
++ * @param[in] in2_ptr Pointer to the source image. Supported data types: same as @p in1_ptr
++ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
++ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
++ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
++ * @param[in] in2_stride_z Stride of the source image in Z dimension (in bytes)
++ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
++ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
++ * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in1_ptr
++ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
++ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
++ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
++ * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes)
++ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
++ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
++ * @param[in] scale Integer scaling factor. Supported data types: S32 (ignored for QS8 and QS16 as the assumption is scale = 1).
++ */
++__kernel void pixelwise_div_int(
++ TENSOR3D_DECLARATION(in1),
++ TENSOR3D_DECLARATION(in2),
++ TENSOR3D_DECLARATION(out),
++ const uint scale)
++{
++ // Get pixels pointer
++ Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
++ Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
++ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
++
++ // Load data
++ VEC_DATA_TYPE(DATA_TYPE_RES, 16)
++ in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
++ VEC_DATA_TYPE(DATA_TYPE_RES, 16)
++ in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
++
++ // Perform division and store result
++ vstore16(DIV_OP(in1_data, in2_data, scale, DATA_TYPE_OUT, 16), 0, (__global DATA_TYPE_OUT *)out.ptr);
++}
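++
++/* Hypothetical build options for an S16 division where `scale` is applied as a
++ * right shift by DIV_OP:
++ *   -DDATA_TYPE_IN1=short -DDATA_TYPE_IN2=short -DDATA_TYPE_OUT=short
++ *   -DDATA_TYPE_RES=int
++ * -DSATURATE switches to the saturating convert, and -DFIXED_POINT_POSITION=n
++ * routes through the fixed-point DIV_OP_EXPAND path instead.
++ */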
+diff --git a/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl b/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
+new file mode 100644
+index 0000000..ddc9d5a
+--- /dev/null
++++ b/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
+@@ -0,0 +1,119 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2016, 2017 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#include "helpers_asymm.h"
++
++#ifdef SATURATE
++#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x))
++#else /* SATURATE */
++#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x))
++#endif /* SATURATE */
++#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round)
++
++#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
++/** Performs a pixelwise multiplication used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
++ *
++ * The following computations will be performed by the kernel:
++ *
++ * -# Add offset terms to inputs
++ * -# Multiply inputs
++ * -# Add offset terms to final result
++ * -# Multiply each entry of result by result_mult_int
++ * -# Shift the int32 accumulator by result_shift
++ * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
++ *
++ * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
++ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
++ * @attention The offset factor of inputs must be passed at compile time using -DIN1_OFFSET and -DIN2_OFFSET
++ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULT_INT and -DRESULT_SHIFT
++ *
++ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8
++ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
++ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
++ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
++ * @param[in] in1_stride_z Stride of the source image in Z dimension (in bytes)
++ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
++ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
++ * @param[in] in2_ptr Pointer to the source image. Supported data types: U8
++ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
++ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
++ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
++ * @param[in] in2_stride_z Stride of the source image in Z dimension (in bytes)
++ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
++ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
++ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8
++ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
++ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
++ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
++ * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes)
++ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
++ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
++ * @param[in] scale Float scaling factor. Supported data types: F32
++ */
++__kernel void pixelwise_mul_qasymm8(
++ TENSOR3D_DECLARATION(in1),
++ TENSOR3D_DECLARATION(in2),
++ TENSOR3D_DECLARATION(out),
++ const float scale)
++{
++ // Get pixels pointer
++ Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
++ Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
++ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
++
++ // Load data
++ VEC_DATA_TYPE(int, 16)
++ in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16));
++ VEC_DATA_TYPE(int, 16)
++ in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16));
++
++ // Perform multiplication of two inputs
++ VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET);
++ VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET);
++ VEC_DATA_TYPE(int, 16) out_val = in1_val * in2_val;
++
++ // Multiply with a multiplier smaller than 1
++ out_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(out_val, RESULT_MULT_INT, RESULT_SHIFT, 16);
++ out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET);
++
++ VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16));
++
++// TODO: Apply min-max BOUND to support fusing with ReLU.
++/*
++#if defined(MIN_BOUND)
++ res = max(res, (uchar16)MIN_BOUND);
++#endif // defined(MIN_BOUND)
++#if defined(MAX_BOUND)
++ res = min(res, (uchar16)MAX_BOUND);
++#endif // defined(MAX_BOUND)
++*/
++
++ // Store result
++ VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)),
++ 0, (__global DATA_TYPE_OUT *)out.ptr);
++}
++#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
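++
++/* The quantized multiply follows the same requantization pattern as the
++ * quantized add: (in1 + IN1_OFFSET) * (in2 + IN2_OFFSET), scaled by
++ * RESULT_MULT_INT/RESULT_SHIFT and offset by RESULT_OFFSET. Note that the
++ * `scale` kernel argument is unused on this path; the scale is assumed to be
++ * folded into RESULT_MULT_INT by the host.
++ */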
+diff --git a/src/core/CL/cl_kernels/reduce_max.cl b/src/core/CL/cl_kernels/reduce_max.cl
+new file mode 100644
+index 0000000..dfa3b85
+--- /dev/null
++++ b/src/core/CL/cl_kernels/reduce_max.cl
+@@ -0,0 +1,60 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2017 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#include "helpers.h"
++
++#if defined(WIDTH)
++/** Perform reduce max
++ *
++ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
++ *
++ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
++ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
++ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
++ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
++ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
++ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
++ */
++__kernel void reduce_max(VECTOR_DECLARATION(input),
++ VECTOR_DECLARATION(output))
++{
++ Vector input = CONVERT_TO_VECTOR_STRUCT(input);
++ Vector output = CONVERT_TO_VECTOR_STRUCT(output);
++
++ __global float *input_addr = (__global float *)(input.ptr);
++ __global float *output_addr = (__global float *)(output.ptr);
++
++ float max_value = *input_addr;
++ for(int x = 1; x < WIDTH; x++)
++ {
++ float value = *(input_addr + x);
++ max_value = max(value, max_value);
++ }
++
++ // Store max
++ *output_addr = max_value;
++}
++#endif // defined(WIDTH)
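++
++/* Launch sketch (assumption): one work item per output element, each scanning
++ * WIDTH values sequentially along X. The body currently reads through float
++ * pointers, so F32 input is the safe assumption despite the F16/F32 note.
++ */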
+diff --git a/src/core/CL/cl_kernels/reduction_mean.cl b/src/core/CL/cl_kernels/reduction_mean.cl
+new file mode 100644
+index 0000000..1a96eea
+--- /dev/null
++++ b/src/core/CL/cl_kernels/reduction_mean.cl
+@@ -0,0 +1,69 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2016, 2017 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#include "helpers.h"
++
++inline DATA_TYPE sum_8(__global const DATA_TYPE *input)
++{
++ VEC_DATA_TYPE(DATA_TYPE, 8)
++ in = vload8(0, input);
++ in.s0123 += in.s4567;
++ in.s01 += in.s23;
++    return in.s0 + in.s1;
++}
++
++/** This function calculates the column-wise mean of a given input image.
++ *
++ * @note The data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=uchar
++ *
++ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
++ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
++ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
++ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
++ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
++ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
++ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
++ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
++ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
++ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
++ * @param[out] local_sums Local memory used to accumulate the column sum
++ * @param[in] height Height of the input image
++ * @param[in] divider Divider to calculate mean
++ */
++__kernel void reduction_mean(
++ IMAGE_DECLARATION(src),
++ IMAGE_DECLARATION(dst),
++ __local DATA_TYPE *local_sums,
++ int height,
++ int divider)
++{
++ // Get pixels pointer
++ Image src = CONVERT_TO_IMAGE_STRUCT(src);
++ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
++
++    // Calculate the column sum; local_sums must be zeroed before accumulating
++    local_sums[0] = (DATA_TYPE)0;
++    for(int i = 0; i < height; i++)
++    {
++        local_sums[0] += sum_8((__global DATA_TYPE *)offset(&src, 0, i));
++    }
++    ((__global DATA_TYPE *)offset(&dst, get_global_id(0), get_global_id(1)))[0] = local_sums[0] / divider;
++}
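++
++/* Launch sketch (assumption): one work item per output column; each privately
++ * accumulates its column into local_sums[0] and writes sum / divider. Because
++ * local memory is shared per work-group, this only holds for one-work-item
++ * work-groups.
++ */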
+diff --git a/src/core/CL/cl_kernels/strided_slice.cl b/src/core/CL/cl_kernels/strided_slice.cl
+new file mode 100644
+index 0000000..c5ff82f
+--- /dev/null
++++ b/src/core/CL/cl_kernels/strided_slice.cl
+@@ -0,0 +1,104 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2017 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#include "helpers.h"
++
++
++inline Tensor4D tensor4D_from_vector_no_step(const Vector *vector, int dim_x, int dim_y, int dim_z, int dim_w)
++{
++ int stride_x = vector->stride_x;
++ int stride_y = stride_x * dim_x;
++ int stride_z = stride_y * dim_y;
++ int stride_w = stride_z * dim_z;
++ Tensor4D tensor =
++ {
++ .ptr = vector->ptr,
++ .offset_first_element_in_bytes = vector->offset_first_element_in_bytes,
++ .stride_x = stride_x,
++ .stride_y = stride_y,
++ .stride_z = stride_z,
++ .stride_w = stride_w,
++ };
++ return tensor;
++}
++
++/** Extracts a strided slice up to 4-dimensions
++ *
++ * @note Datatype should be given as a preprocessor argument using -DELEMENT_DATA_TYPE=type. e.g. -DELEMENT_DATA_TYPE=short
++ * @note The size of an element should be given as a preprocessor argument using -DELEMENT_SIZE=size. e.g. -DELEMENT_SIZE=2
++ *
++ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32
++ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
++ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
++ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
++ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
++ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
++ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
++ * @param[in] dims_in The four dimensions of the input tensor. Supported data types: S32
++ * @param[in] dims_out The four dimensions of the output tensor. Supported data types: S32
++ * @param[in] starts The start index of each dimension of the input tensor to be sliced. Supported data types: S32
++ * @param[in] strides The slice stride of each dimension of the input tensor. Supported data types: S32
++ */
++__kernel void strided_slice(VECTOR_DECLARATION(input),
++ VECTOR_DECLARATION(output),
++ const int4 dims_in,
++ const int4 dims_out,
++ const int4 starts,
++ const int4 strides)
++{
++    // TODO: Should be changed to CONVERT_TO_TENSOR4D_STRUCT to avoid inferring the offsets manually
++ Vector vec_out = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output);
++ Vector vec_in = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input);
++
++    // Implementation:
++ // Infer a Tensor4D from output Vector and output's dimensions info
++ // Infer a Tensor4D from input Vector and input's dimensions info
++ // Infer indices of output as 4D from the offset of output vector
++ // Infer indices of input as 4D from indices of output
++ // out(offset of output vector) = in(offset of input)
++
++ Tensor4D tensor_out = tensor4D_from_vector_no_step(&vec_out, dims_out.x, dims_out.y, dims_out.z, dims_out.w);
++ Tensor4D tensor_in = tensor4D_from_vector_no_step(&vec_in, dims_in.x, dims_in.y, dims_in.z, dims_in.w);
++
++    // Requires output_step_x == output_stride_x == element size
++ const int offset_out = get_global_id(0) * output_stride_x;
++ int4 indices_out =
++ {
++ get_global_id(0) % dims_out.x,
++ (offset_out / tensor_out.stride_y) % dims_out.y,
++ (offset_out / tensor_out.stride_z) % dims_out.z,
++ (offset_out / tensor_out.stride_w) % dims_out.w,
++ };
++
++ int4 indices_in =
++ {
++ starts.x + (strides.x * indices_out.x),
++ starts.y + (strides.y * indices_out.y),
++ starts.z + (strides.z * indices_out.z),
++ starts.w + (strides.w * indices_out.w),
++ };
++
++ *((__global ELEMENT_DATA_TYPE *)vector_offset(&vec_out, get_global_id(0))) = *((__global ELEMENT_DATA_TYPE *)tensor4D_offset(&tensor_in, indices_in.x, indices_in.y, indices_in.z, indices_in.w));
++}
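++
++/* Index mapping illustrated with hypothetical values: for starts = (1,0,0,0)
++ * and strides = (2,1,1,1), output element (x,y,z,w) reads input element
++ * (1 + 2x, y, z, w); both sides are flattened through the inferred Tensor4D
++ * strides above.
++ */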
+diff --git a/src/core/CL/cl_kernels/topkv2.cl b/src/core/CL/cl_kernels/topkv2.cl
+new file mode 100644
+index 0000000..0b0cf82
+--- /dev/null
++++ b/src/core/CL/cl_kernels/topkv2.cl
+@@ -0,0 +1,111 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2017 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++
++#include "helpers.h"
++
++__kernel void topkv2_init(VECTOR_DECLARATION(input),
++ __global float* in_key_buf,
++ __global int* in_ind_buf,
++ const int n)
++{
++ int gid = get_global_id(0);
++ int lws = get_local_size(0);
++ int groups = get_num_groups(0);
++ int gws = lws * groups;
++ int iter = n / gws;
++
++ Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input);
++
++ for(int i = 0; i < iter; ++i)
++ {
++ int idx = i * gws + gid;
++ in_key_buf[idx] = *(__global float*)(input.ptr + idx * input.stride_x);
++ in_ind_buf[idx] = idx;
++ }
++}
++
++__kernel void topkv2_find_first_negative(
++ __global float *out_key_buf,
++ __global int *first_negative_idx,
++ int n)
++{
++ int gid = get_global_id(0);
++
++ if( gid == n - 1 )
++ {
++ // if the last item is positive, the first negative index is n.
++ if( out_key_buf[gid] > 0.f )
++ *first_negative_idx = n;
++ } else if ( gid == 0 ) {
++        // if the first item is negative, set it to 0.
++ if( out_key_buf[gid] < 0.f )
++ *first_negative_idx = 0;
++ } else {
++        // if its left neighbour is positive and it is negative, then it is the first negative item.
++ if( out_key_buf[gid-1] > 0.f && out_key_buf[gid] < 0.f )
++ *first_negative_idx = gid;
++ }
++}
++
++__kernel void topkv2_reorder_negatives(
++ __global float* in_key_buf,
++ __global float* out_key_buf,
++    __global int* in_ind_buf,
++    __global int* out_ind_buf,
++ __global int* first_negative_idx,
++ int n)
++{
++ int gid = get_global_id(0);
++
++ int num_negs = n - *first_negative_idx;
++ int in_idx;
++
++ if( gid < num_negs ) {
++ in_idx = n - 1 - gid;
++ } else {
++ in_idx = gid - num_negs;
++ }
++
++ out_key_buf[gid] = in_key_buf[in_idx];
++ out_ind_buf[gid] = in_ind_buf[in_idx];
++}
++
++__kernel void topkv2_store(
++ VECTOR_DECLARATION(values),
++ VECTOR_DECLARATION(indices),
++ __global float *out_key_buf,
++ __global int *out_ind_buf,
++ int n)
++{
++ int gid = get_global_id(0);
++
++ Vector values = CONVERT_TO_VECTOR_STRUCT_NO_STEP(values);
++ Vector indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(indices);
++
++ int idx = n - 1 - gid;
++
++ *(__global float*)(values.ptr + gid * values.stride_x) = out_key_buf[idx];
++ *(__global int*)(indices.ptr + gid * indices.stride_x) = out_ind_buf[idx];
++}
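++
++/* Assumed pipeline for these kernels (the sort itself lives in the radix-sort
++ * file): topkv2_init packs keys and indices, an unsigned radix sort orders the
++ * keys by bit pattern (positives ascending, negatives after them in reverse),
++ * topkv2_find_first_negative and topkv2_reorder_negatives restore true numeric
++ * order, and topkv2_store reads the top k values/indices from the sorted tail.
++ */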
+diff --git a/src/core/CL/cl_kernels/topkv2_quicksort.cl b/src/core/CL/cl_kernels/topkv2_quicksort.cl
+new file mode 100644
+index 0000000..deadf84
+--- /dev/null
++++ b/src/core/CL/cl_kernels/topkv2_quicksort.cl
+@@ -0,0 +1,138 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2017 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++
++#include "helpers.h"
++
++inline __global float* get_vec_elem(Vector* vec, int idx)
++{
++    return (__global float*)(vec->ptr + idx * vec->stride_x);
++}
++
++inline __global int* get_vec_elem_int(Vector* vec, int idx)
++{
++    return (__global int*)(vec->ptr + idx * vec->stride_x);
++}
++
++// A utility function to swap two elements
++void swap(__global float *a, __global float *b)
++{
++ float t = *a;
++ *a = *b;
++ *b = t;
++}
++
++void swap_idx(__global int *a, __global int *b)
++{
++ int t = *a;
++ *a = *b;
++ *b = t;
++}
++
++/* This function is the same in both the iterative and recursive versions */
++int partition (Vector* arr, __global int* indices, int l, int h)
++{
++ float x = *get_vec_elem(arr, h);
++ int i = (l - 1);
++
++    for(int j = l; j <= h - 1; j++)
++    {
++        if(*get_vec_elem(arr, j) >= x)
++        {
++            i++;
++            swap(get_vec_elem(arr, i), get_vec_elem(arr, j));
++            swap_idx(&indices[i], &indices[j]);
++        }
++    }
++    swap(get_vec_elem(arr, i + 1), get_vec_elem(arr, h));
++ swap_idx(&indices[i + 1], &indices[h]);
++ return (i + 1);
++}
++
++/* A[] --> Array to be sorted,
++ l --> Starting index,
++ h --> Ending index */
++void quickSortIterative (Vector* arr, __global int* indices,
++ __global int *stack, int l, int h)
++{
++ // Create an auxiliary stack
++
++ // initialize top of stack
++ int top = -1;
++
++ // push initial values of l and h to stack
++ stack[ ++top ] = l;
++ stack[ ++top ] = h;
++
++    // Keep popping from the stack while it is not empty
++ while ( top >= 0 )
++ {
++ // Pop h and l
++ h = stack[ top-- ];
++ l = stack[ top-- ];
++
++ // Set pivot element at its correct position
++ // in sorted array
++ int p = partition( arr, indices, l, h );
++
++ // If there are elements on left side of pivot,
++ // then push left side to stack
++ if ( p-1 > l )
++ {
++ stack[ ++top ] = l;
++ stack[ ++top ] = p - 1;
++ }
++
++ // If there are elements on right side of pivot,
++ // then push right side to stack
++ if ( p+1 < h )
++ {
++ stack[ ++top ] = p + 1;
++ stack[ ++top ] = h;
++ }
++ }
++}
++
++__kernel void topkv2_quicksort(VECTOR_DECLARATION(input),
++ VECTOR_DECLARATION(topk_values), VECTOR_DECLARATION(topk_indices),
++ __global int* indices, __global int* temp_stack, int k, int n)
++{
++ Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input);
++ Vector topk_values = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_values);
++ Vector topk_indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_indices);
++
++ for( int i = 0; i < n; ++i )
++ {
++ indices[i] = i;
++ }
++
++ quickSortIterative(&input, indices, temp_stack, 0, n-1);
++
++ // extract k items.
++ for(int i = 0; i < k; ++i)
++ {
++ *get_vec_elem(&topk_values, i) = *get_vec_elem(&input, i);
++ *get_vec_elem_int(&topk_indices, i) = indices[i];
++ }
++}
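++
++/* Note: partition() keeps elements `>= pivot` on the left, so the sort is
++ * descending and the first k elements are the top-k directly. A plausible
++ * launch is a single work item, since the sort is fully sequential.
++ */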
+diff --git a/src/core/CL/cl_kernels/topkv2_radixsort.cl b/src/core/CL/cl_kernels/topkv2_radixsort.cl
+new file mode 100644
+index 0000000..cac0c07
+--- /dev/null
++++ b/src/core/CL/cl_kernels/topkv2_radixsort.cl
+@@ -0,0 +1,279 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2017 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++
++// reference:
++// https://code.google.com/archive/p/ocl-radix-sort/source/default/source
++// OpenCL kernel sources for the CLRadixSort class
++// the #include does not exist in OpenCL
++// Copyright Philippe Helluy, Université de Strasbourg, France, 2011, helluy@math.unistra.fr
++// licensed under the GNU Lesser General Public License; see http://www.gnu.org/copyleft/lesser.html
++// if you find this software useful you can cite the following work in your reports or articles:
++// Philippe HELLUY, A portable implementation of the radix sort algorithm in OpenCL, 2011.
++// http://hal.archives-ouvertes.fr/hal-00596730
++
++// Reference for floating point radix sort:
++// http://www.codercorner.com/RadixSortRevisited.htm
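++// Note: the kernels below reinterpret float bits directly as unsigned ints,
++// which preserves ordering only for non-negative IEEE-754 values. The usual
++// transform for arbitrary floats (from the reference above) would be, as a
++// sketch:
++//   unsigned int radix_key(float f)
++//   {
++//       unsigned int u = *((unsigned int *)&f);
++//       return (u & 0x80000000u) ? ~u : (u | 0x80000000u);
++//   }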
++
++// compute the histogram for each radix and each virtual processor for the pass
++__kernel void radixsort_histogram(__global float* in_key_buf,
++ __global int* d_Histograms,
++ const int pass,
++ __local int* loc_histo,
++ const int n)
++{
++    int it = get_local_id(0);  // local index of the work item within its group
++    int ig = get_global_id(0); // global index = it + gr * items
++
++ int gr = get_group_id(0); // g group number
++
++ int groups = get_num_groups(0);
++ int items = get_local_size(0);
++
++ // set the local histograms to zero
++ for(int ir=0;ir<_RADIX;ir++){
++ loc_histo[ir * items + it] = 0;
++ }
++
++ barrier(CLK_LOCAL_MEM_FENCE);
++
++ // range of keys that are analyzed by the work item
++ int size= n/groups/items; // size of the sub-list
++ int start= ig * size; // beginning of the sub-list
++
++ unsigned int key;
++ int shortkey,k;
++
++ // compute the index
++ // the computation depends on the transposition
++ for(int j = 0; j < size ; j++) {
++#ifdef TRANSPOSE
++ k= groups * items * j + ig;
++#else
++ k=j+start;
++#endif
++
++ key = *((__global unsigned int*)(in_key_buf + k));
++
++ // extract the group of _BITS bits of the pass
++ // the result is in the range 0.._RADIX-1
++ shortkey=(( key >> (pass * _BITS)) & (_RADIX-1));
++
++ // increment the local histogram
++ loc_histo[shortkey * items + it ]++;
++ }
++
++ barrier(CLK_LOCAL_MEM_FENCE);
++
++ // copy the local histogram to the global one
++ for(int ir=0;ir<_RADIX;ir++) {
++ d_Histograms[items * (ir * groups + gr) + it] = loc_histo[ir * items + it];
++ }
++
++ barrier(CLK_GLOBAL_MEM_FENCE);
++}
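++// Layout sketch: d_Histograms is a _RADIX x groups x items cube flattened as
++// d_Histograms[items * (ir * groups + gr) + it]; e.g. with _RADIX = 16,
++// groups = 4 and items = 64, radix 2 of group 1, item 5 lands at
++// 64 * (2 * 4 + 1) + 5 = 581.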
++
++// initial transpose of the list to improve
++// coalesced memory access
++__kernel void transpose(const __global int* invect,
++ __global int* outvect,
++ const int nbcol,
++ const int nbrow,
++ const __global int* inperm,
++ __global int* outperm,
++ __local int* blockmat,
++ __local int* blockperm,
++ const int tilesize){
++
++ int i0 = get_global_id(0)*tilesize; // first row index
++ int j = get_global_id(1); // column index
++
++ int jloc = get_local_id(1); // local column index
++
++ // fill the cache
++ for(int iloc=0;iloc<tilesize;iloc++){
++ int k=(i0+iloc)*nbcol+j; // position in the matrix
++ blockmat[iloc*tilesize+jloc]=invect[k];
++#ifdef PERMUT
++ blockperm[iloc*tilesize+jloc]=inperm[k];
++#endif
++ }
++
++ barrier(CLK_LOCAL_MEM_FENCE);
++
++ // first row index in the transpose
++ int j0=get_group_id(1)*tilesize;
++
++    // write the cached tile back to its transposed location
++ for(int iloc=0;iloc<tilesize;iloc++){
++ int kt=(j0+iloc)*nbrow+i0+jloc; // position in the transpose
++ outvect[kt]=blockmat[jloc*tilesize+iloc];
++#ifdef PERMUT
++ outperm[kt]=blockperm[jloc*tilesize+iloc];
++#endif
++ }
++
++}
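++// Launch sketch (an assumption about the host setup): global size
++// (nbrow / tilesize, nbcol) with local size (1, tilesize), so that each
++// work-group moves one tilesize x tilesize tile through local memory.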
++
++// each virtual processor reorders its data using the scanned histogram
++__kernel void radixsort_reorder(__global float* in_key,
++ __global float* out_key,
++ __global int* d_Histograms,
++ const int pass,
++ __global int* indices_in,
++ __global int* indices_out,
++ __local int* loc_histo,
++ const int n){
++
++ int it = get_local_id(0);
++ int ig = get_global_id(0);
++
++ int gr = get_group_id(0);
++ int groups=get_num_groups(0);
++ int items=get_local_size(0);
++
++ int start= ig *(n/groups/items);
++ int size= n/groups/items;
++
++ // take the histogram in the cache
++ for(int ir=0;ir<_RADIX;ir++){
++ loc_histo[ir * items + it]=
++ d_Histograms[items * (ir * groups + gr) + it];
++ }
++ barrier(CLK_LOCAL_MEM_FENCE);
++
++ int newpos,shortkey,k,newpost;
++ unsigned int key;
++
++ for(int j= 0; j< size;j++){
++#ifdef TRANSPOSE
++ k= groups * items * j + ig;
++#else
++ k=j+start;
++#endif
++ float org_value = in_key[k];
++ key = *(__global unsigned int*)(in_key + k);
++ shortkey=((key >> (pass * _BITS)) & (_RADIX-1));
++
++ newpos=loc_histo[shortkey * items + it];
++
++#ifdef TRANSPOSE
++ int ignew,jnew;
++ ignew= newpos/(n/groups/items);
++ jnew = newpos%(n/groups/items);
++ newpost = jnew * (groups*items) + ignew;
++#else
++ newpost=newpos;
++#endif
++
++        // Writing the reinterpreted integer key here (the original "killing
++        // line") would corrupt the float data, so write the original value:
++ out_key[newpost] = org_value;
++
++#ifdef PERMUT
++ indices_out[newpost] = indices_in[k];
++#endif
++
++ newpos++;
++ loc_histo[shortkey * items + it]=newpos;
++ }
++}
++
++// perform a parallel prefix sum (a scan) on the local histograms
++// (see Blelloch 1990); each work item handles two elements
++// see also http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html
++__kernel void radixsort_scanhistograms(__global int* histo, __local int* temp, __global int* globsum)
++{
++ int it = get_local_id(0);
++ int ig = get_global_id(0);
++ int decale = 1;
++ int n=get_local_size(0) * 2 ;
++ int gr=get_group_id(0);
++
++    // load the input into local memory
++ temp[2*it] = histo[2*ig];
++ temp[2*it+1] = histo[2*ig+1];
++
++    // up-sweep (reduce) phase of the parallel prefix sum (Blelloch 1990)
++ for (int d = n>>1; d > 0; d >>= 1){
++ barrier(CLK_LOCAL_MEM_FENCE);
++ if (it < d){
++ int ai = decale*(2*it+1)-1;
++ int bi = decale*(2*it+2)-1;
++ temp[bi] += temp[ai];
++ }
++ decale *= 2;
++ }
++
++    // store the group total in the global sum vector
++    // (used in the next step to construct the global scan),
++    // then clear the last element
++ if (it == 0) {
++ globsum[gr]=temp[n-1];
++ temp[n - 1] = 0;
++ }
++
++ // down sweep phase
++ for (int d = 1; d < n; d *= 2){
++ decale >>= 1;
++ barrier(CLK_LOCAL_MEM_FENCE);
++
++ if (it < d){
++ int ai = decale*(2*it+1)-1;
++ int bi = decale*(2*it+2)-1;
++
++ int t = temp[ai];
++ temp[ai] = temp[bi];
++ temp[bi] += t;
++ }
++
++ }
++ barrier(CLK_LOCAL_MEM_FENCE);
++
++ // write results to device memory
++
++ histo[2*ig] = temp[2*it];
++ histo[2*ig+1] = temp[2*it+1];
++
++ barrier(CLK_GLOBAL_MEM_FENCE);
++
++}
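++// Worked example (illustrative): scanning the local array {3, 1, 7, 0}, the
++// up-sweep produces {3, 4, 7, 11}; the total 11 is saved to globsum, the last
++// element is cleared, and the down-sweep yields the exclusive scan
++// {0, 3, 4, 11}.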
++
++// use the global sum for updating the local histograms
++// each work item updates two values
++__kernel void radixsort_pastehistograms( __global int* histo,__global int* globsum)
++{
++ int ig = get_global_id(0);
++ int gr=get_group_id(0);
++
++ int s;
++
++ s=globsum[gr];
++
++ // write results to device memory
++ histo[2*ig] += s;
++ histo[2*ig+1] += s;
++
++ barrier(CLK_GLOBAL_MEM_FENCE);
++}
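++// Example (assuming globsum has itself been scanned by a second pass of
++// radixsort_scanhistograms): if a group's scanned histogram is {0, 3, 4, 11}
++// and its globsum entry is 20, the pasted result is {20, 23, 24, 31}, turning
++// per-group offsets into global ones.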
+diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp
+index a78b3e1..4c3ecad 100644
+--- a/src/core/CL/kernels/CLActivationLayerKernel.cpp
++++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp
+@@ -33,6 +33,7 @@
+ #include "arm_compute/core/Utils.h"
+ #include "arm_compute/core/Validate.h"
+ #include "arm_compute/core/Window.h"
++#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
+ #include "arm_compute/core/CL/CLHelpers.h"
+ #include "arm_compute/core/Types.h"
+@@ -49,8 +50,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->data_type() == DataType::QASYMM8) && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ && (act_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+- && (act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU),
+- "For QASYMM8 only relu, lower bounded relu and lower-upper bounded relu are supported");
++ && (act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU)
++ && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC),
++ "For QASYMM8 only relu, lower bounded relu, lower-upper bounded relu and logistic are supported");
+
+ // Checks performed when output is configured
+ if((output != nullptr) && (output->total_size() != 0))
+@@ -93,6 +95,43 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+ }
++
++inline bool is_activation_logistic(const ActivationLayerInfo &act_info)
++{
++    return act_info.activation() == ActivationLayerInfo::ActivationFunction::LOGISTIC;
++}
++
++/** Calculates logistic parameters from the quantized input scale and the scaling factor for the exponent, and adds them as build options.
++ *
++ * Prepares these build options:
++ * - INPUT_MULTIPLIER, INPUT_LEFT_SHIFT - quantized representation of the multiplier.
++ * - INPUT_RANGE_RADIUS - threshold on the difference between the maximum input value and the value being processed;
++ *   it determines whether the value is taken into account or saturated.
++ *
++ * @param[in,out] build_opts  Build options to extend
++ * @param[in]     input_scale Input scaling factor
++ */
++void prepare_quantized_logistic_build_options(std::set<std::string> *build_opts, float input_scale)
++{
++ // Number of integer bits in temporary fixed-point representation of current-to-max difference
++ static const int input_integer_bits = 4;
++
++ const double input_real_multiplier = input_scale * (1ll << (31 - input_integer_bits));
++ int input_multiplier, input_left_shift;
++ quantization::calculate_quantized_multiplier_greater_than_one(input_real_multiplier, &input_multiplier, &input_left_shift);
++
++ const double max_input_rescaled = 1.0 * ((1 << input_integer_bits) - 1) * (1ll << (31 - input_integer_bits)) / (1ll << input_left_shift);
++ const int input_range_radius = std::floor(max_input_rescaled);
++
++ build_opts->emplace(("-DINPUT_INTEGER_BITS=" + support::cpp11::to_string(input_integer_bits)));
++ build_opts->emplace(("-DINPUT_MULTIPLIER=" + support::cpp11::to_string(input_multiplier)));
++ build_opts->emplace(("-DINPUT_LEFT_SHIFT=" + support::cpp11::to_string(input_left_shift)));
++ build_opts->emplace(("-DINPUT_RANGE_RADIUS=" + support::cpp11::to_string(input_range_radius)));
++}
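++
++// Worked example (assuming the gemmlowp-style convention in which the helper
++// returns a Q0.31 mantissa in [2^30, 2^31) plus a left shift): for
++// input_scale = 1/256, input_real_multiplier = (1/256) * 2^27 = 524288
++// = 0.5 * 2^20, so INPUT_MULTIPLIER = 2^30, INPUT_LEFT_SHIFT = 20 and
++// max_input_rescaled = 15 * 2^27 / 2^20 = 1920, i.e. INPUT_RANGE_RADIUS = 1920.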
+ } // namespace
+
+ CLActivationLayerKernel::CLActivationLayerKernel()
+@@ -181,8 +220,16 @@ void CLActivationLayerKernel::configure(ICLTensor *input, ICLTensor *output, Act
+ build_opts.emplace(("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(fixed_point_position)));
+ }
+
++ if(is_data_type_quantized_asymmetric(dt) && is_activation_logistic(act_info))
++ {
++ prepare_quantized_logistic_build_options(&build_opts, input->info()->quantization_info().scale);
++ }
++
+ // Create kernel
+- std::string kernel_name = is_data_type_quantized_asymmetric(dt) ? std::string("activation_layer_qa8") : std::string("activation_layer");
++    std::string kernel_name = std::string("activation_layer");
++    if(is_data_type_quantized_asymmetric(dt))
++    {
++        kernel_name = is_activation_logistic(act_info) ? std::string("activation_layer_logistic_qa8") : std::string("activation_layer_qa8");
++    }
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+ // Make sure _kernel is initialized before calling the parent's configure
+diff --git a/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp b/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
+index c4904ec..f5f4f1a 100644
+--- a/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
++++ b/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
+@@ -25,6 +25,7 @@
+
+ #include "arm_compute/core/CL/CLHelpers.h"
+ #include "arm_compute/core/CL/ICLTensor.h"
++#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
+ using namespace arm_compute;
+
+@@ -36,8 +37,13 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2,
+ {
+ ARM_COMPUTE_UNUSED(policy);
+
+- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
++ if (is_data_type_quantized_asymmetric(output.data_type()))
++ {
++ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output, &input1);
++ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output, &input2);
++ }
+
+ const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
+
+@@ -47,7 +53,7 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2,
+ // Validate in case of configured output
+ if(output.total_size() > 0)
+ {
+- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((output.data_type() == DataType::U8) && ((input1.data_type() != DataType::U8) || (input2.data_type() != DataType::U8)),
+ "Output can only be U8 if both inputs are U8");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
+@@ -132,8 +138,40 @@ void CLArithmeticAdditionKernel::configure(const ICLTensor *input1, const ICLTen
+ build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input1->info()->fixed_point_position()));
+ }
+
++ if (is_data_type_quantized_asymmetric(output->info()->data_type()))
++ {
++ const int left_shift = 20;
++ const double twice_max_input_scale = 2 * std::max(input1->info()->quantization_info().scale, input2->info()->quantization_info().scale);
++ const double real_input1_multiplier = input1->info()->quantization_info().scale / twice_max_input_scale;
++ const double real_input2_multiplier = input2->info()->quantization_info().scale / twice_max_input_scale;
++ const double real_output_multiplier = twice_max_input_scale / ((1 << left_shift) * output->info()->quantization_info().scale);
++
++ int input1_multiplier, input2_multiplier, output_multiplier;
++ int input1_shift, input2_shift, output_shift;
++ quantization::calculate_quantized_multiplier_less_than_one(real_input1_multiplier, &input1_multiplier, &input1_shift);
++ quantization::calculate_quantized_multiplier_less_than_one(real_input2_multiplier, &input2_multiplier, &input2_shift);
++ quantization::calculate_quantized_multiplier_less_than_one(real_output_multiplier, &output_multiplier, &output_shift);
++
++ build_opts.emplace("-DIN1_MULT_INT=" + support::cpp11::to_string(input1_multiplier));
++ build_opts.emplace("-DIN2_MULT_INT=" + support::cpp11::to_string(input2_multiplier));
++ build_opts.emplace("-DRESULT_MULT_INT=" + support::cpp11::to_string(output_multiplier));
++ build_opts.emplace("-DLEFT_SHIFT=" + support::cpp11::to_string(left_shift));
++ build_opts.emplace("-DIN1_SHIFT=" + support::cpp11::to_string(input1_shift));
++ build_opts.emplace("-DIN2_SHIFT=" + support::cpp11::to_string(input2_shift));
++ build_opts.emplace("-DRESULT_SHIFT=" + support::cpp11::to_string(output_shift));
++ build_opts.emplace("-DIN1_OFFSET=" + support::cpp11::to_string(-(input1->info()->quantization_info().offset)));
++ build_opts.emplace("-DIN2_OFFSET=" + support::cpp11::to_string(-(input2->info()->quantization_info().offset)));
++ build_opts.emplace("-DRESULT_OFFSET=" + support::cpp11::to_string(output->info()->quantization_info().offset));
++        // TODO: Apply min/max bounds to support fusing with ReLU.
++ }
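++    // Sketch of the arithmetic the matching kernel is expected to perform
++    // (TFLite-style; rescale(v, m, s) ~ rounded (v * m) >> (31 + s)):
++    //   a1  = (in1 + IN1_OFFSET) << LEFT_SHIFT
++    //   a2  = (in2 + IN2_OFFSET) << LEFT_SHIFT
++    //   sum = rescale(a1, IN1_MULT_INT, IN1_SHIFT) + rescale(a2, IN2_MULT_INT, IN2_SHIFT)
++    //   out = rescale(sum, RESULT_MULT_INT, RESULT_SHIFT) + RESULT_OFFSET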
++
+ // Create kernel
+- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("arithmetic_add", build_opts));
++ std::string kernel_name = "arithmetic_add";
++ if (is_data_type_quantized_asymmetric(output->info()->data_type()))
++ {
++ kernel_name += "_qasymm8";
++ }
++ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+ ICLKernel::configure(win_config.second);
+ }
+diff --git a/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp b/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
+index 8308aa0..3053222 100644
+--- a/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
++++ b/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
+@@ -1,5 +1,6 @@
+ /*
+- * Copyright (c) 2016, 2017 ARM Limited.
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+@@ -24,37 +25,33 @@
+ #include "arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h"
+
+ #include "arm_compute/core/CL/CLHelpers.h"
+-#include "arm_compute/core/CL/CLKernelLibrary.h"
+ #include "arm_compute/core/CL/ICLTensor.h"
+-#include "arm_compute/core/CL/OpenCL.h"
+-#include "arm_compute/core/Helpers.h"
+-#include "arm_compute/core/IAccessWindow.h"
+-#include "arm_compute/core/TensorInfo.h"
+-#include "arm_compute/core/Validate.h"
+-#include "arm_compute/core/Window.h"
+-
+-#include <set>
+-#include <string>
+
+ using namespace arm_compute;
+
+ namespace
+ {
++constexpr unsigned int num_elems_processed_per_iteration = 16;
++
+ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+ {
+ ARM_COMPUTE_UNUSED(policy);
++
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, input2);
++
++ const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
++
++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2);
+
+ // Validate in case of configured output
+- if((output != nullptr) && (output->total_size() != 0))
++ if(output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
+ "Output can only be U8 if both inputs are U8");
+- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, output);
++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, output);
+ }
+
+@@ -63,17 +60,39 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
+
+ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
+ {
+- constexpr unsigned int num_elems_processed_per_iteration = 16;
++ const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
++ const TensorShape &out_shape = broadcast_pair.first;
++ const ValidRegion &valid_region = broadcast_pair.second;
++
++ // Auto initialize output if not initialized
++ {
++ set_shape_if_empty(*output, out_shape);
++
++ if(input1->data_type() == DataType::S16 || input2->data_type() == DataType::S16)
++ {
++ set_format_if_unknown(*output, Format::S16);
++ }
++ else if(input1->data_type() == DataType::F16 && input2->data_type() == DataType::F16)
++ {
++ set_format_if_unknown(*output, Format::F16);
++ }
++ else if(input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32)
++ {
++ set_format_if_unknown(*output, Format::F32);
++ }
++ }
++
++ Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
++ Window win_input1 = win.broadcast_if_dimension_le_one(*input1);
++ Window win_input2 = win.broadcast_if_dimension_le_one(*input2);
+
+- Window win = calculate_max_window(*input1, Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+- bool window_changed = update_window_and_padding(win, input1_access, input2_access, output_access);
+-
+- ValidRegion valid_region = intersect_valid_regions(input1->valid_region(),
+- input2->valid_region());
++ bool window_changed = update_window_and_padding(win_input1, input1_access)
++ || update_window_and_padding(win_input2, input2_access)
++ || update_window_and_padding(win, output_access);
+
+ output_access.set_valid_region(win, valid_region);
+
+@@ -90,28 +109,17 @@ CLArithmeticSubtractionKernel::CLArithmeticSubtractionKernel()
+ void CLArithmeticSubtractionKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+-
+- // Auto initialize output if not initialized
+- {
+- set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+-
+- if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
+- {
+- set_format_if_unknown(*output->info(), Format::S16);
+- }
+- else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
+- {
+- set_format_if_unknown(*output->info(), Format::F32);
+- }
+- }
+-
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(), policy));
+
++ // Configure kernel window
++ auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
++ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
++
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+- bool has_float_out = is_data_type_float(output->info()->data_type());
++ const bool has_float_out = is_data_type_float(output->info()->data_type());
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+@@ -127,14 +135,12 @@ void CLArithmeticSubtractionKernel::configure(const ICLTensor *input1, const ICL
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("arithmetic_sub", build_opts));
+
+- // Configure kernel window
+- auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
+- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure(win_config.second);
+ }
+
+ Status CLArithmeticSubtractionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+ {
++ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, policy));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
+
+@@ -146,16 +152,49 @@ void CLArithmeticSubtractionKernel::run(const Window &window, cl::CommandQueue &
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+- Window slice = collapsed.first_slice_window_3D();
++ const TensorShape &in_shape1 = _input1->info()->tensor_shape();
++ const TensorShape &in_shape2 = _input2->info()->tensor_shape();
++ const TensorShape &out_shape = _output->info()->tensor_shape();
++
++ bool can_collapse = true;
++ if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
++ {
++ can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
++ for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
++ {
++ can_collapse = (in_shape1[d] == in_shape2[d]);
++ }
++ }
++
++ bool has_collapsed = false;
++ Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
++
++ const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
++ const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
++
++ Window slice = collapsed.first_slice_window_3D();
++ Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
++ Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+ do
+ {
+ unsigned int idx = 0;
+- add_3D_tensor_argument(idx, _input1, slice);
+- add_3D_tensor_argument(idx, _input2, slice);
++
++ add_3D_tensor_argument(idx, _input1, slice_input1);
++ add_3D_tensor_argument(idx, _input2, slice_input2);
+ add_3D_tensor_argument(idx, _output, slice);
++
+ enqueue(queue, *this, slice);
++
++ collapsed.slide_window_slice_3D(slice_input1);
++ collapsed.slide_window_slice_3D(slice_input2);
+ }
+ while(collapsed.slide_window_slice_3D(slice));
+ }
++
++BorderSize CLArithmeticSubtractionKernel::border_size() const
++{
++ const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
++ const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
++ return BorderSize(0, border, 0, 0);
++}
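++
++// Example: with num_elems_processed_per_iteration = 16, an output of width 16
++// broadcast from an input of width 1 gives replicateSize = 15, hence a right
++// border of 15 elements.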
+diff --git a/src/core/CL/kernels/CLCastKernel.cpp b/src/core/CL/kernels/CLCastKernel.cpp
+new file mode 100644
+index 0000000..204ae74
+--- /dev/null
++++ b/src/core/CL/kernels/CLCastKernel.cpp
+@@ -0,0 +1,115 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2016-2018 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#include "arm_compute/core/CL/kernels/CLCastKernel.h"
++
++#include "arm_compute/core/CL/CLHelpers.h"
++#include "arm_compute/core/CL/CLKernelLibrary.h"
++#include "arm_compute/core/CL/ICLTensor.h"
++#include "arm_compute/core/Helpers.h"
++#include "arm_compute/core/IAccessWindow.h"
++#include "arm_compute/core/TensorInfo.h"
++#include "arm_compute/core/Utils.h"
++#include "arm_compute/core/Validate.h"
++#include "arm_compute/core/Window.h"
++
++using namespace arm_compute;
++
++CLCastKernel::CLCastKernel()
++ : _input(nullptr), _output(nullptr)
++{
++}
++
++void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output)
++{
++ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
++
++ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
++ DataType::S16, DataType::S32, DataType::F16, DataType::F32);
++ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
++ DataType::S16, DataType::S32, DataType::F16, DataType::F32);
++ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
++
++ _input = input;
++ _output = output;
++
++ constexpr unsigned int num_elems_processed_per_iteration = 16;
++
++ // Set kernel build options
++ std::set<std::string> build_opts;
++ build_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
++ build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
++ build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
++
++ // Create kernel
++ if (is_data_type_quantized_asymmetric(input->info()->data_type()))
++ {
++ const float scale_in = input->info()->quantization_info().scale;
++ const int offset_in = input->info()->quantization_info().offset;
++ build_opts.emplace("-DSCALE_IN=" + float_to_string_with_full_precision(scale_in));
++ build_opts.emplace("-DOFFSET_IN=" + support::cpp11::to_string(offset_in));
++
++ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("cast_qasymm_in", build_opts));
++ }
++ else if (is_data_type_quantized_asymmetric(output->info()->data_type()))
++ {
++ const float scale_in = output->info()->quantization_info().scale;
++ const int offset_in = output->info()->quantization_info().offset;
++ build_opts.emplace("-DSCALE_IN=" + float_to_string_with_full_precision(scale_in));
++ build_opts.emplace("-DOFFSET_IN=" + support::cpp11::to_string(offset_in));
++
++ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("cast_qasymm_out", build_opts));
++ }
++ else
++ {
++ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("cast", build_opts));
++ }
++
++ // Configure kernel window
++ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
++ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
++ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
++ update_window_and_padding(win, input_access, output_access);
++ output_access.set_valid_region(win, input->info()->valid_region());
++
++ ICLKernel::configure(win);
++}
++
++void CLCastKernel::run(const Window &window, cl::CommandQueue &queue)
++{
++ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
++ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
++
++ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
++ Window slice = collapsed.first_slice_window_3D();
++
++ do
++ {
++ unsigned int idx = 0;
++ add_3D_tensor_argument(idx, _input, slice);
++ add_3D_tensor_argument(idx, _output, slice);
++ enqueue(queue, *this, slice);
++ }
++ while(collapsed.slide_window_slice_3D(slice));
++}
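++
++// Usage sketch (illustrative; tensor names are hypothetical):
++//   CLCastKernel cast;
++//   cast.configure(&qasymm8_in, &f32_out); // selects the cast_qasymm_in kernel
++//   CLScheduler::get().enqueue(cast);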
+diff --git a/src/core/CL/kernels/CLGatherKernel.cpp b/src/core/CL/kernels/CLGatherKernel.cpp
+new file mode 100644
+index 0000000..0a83008
+--- /dev/null
++++ b/src/core/CL/kernels/CLGatherKernel.cpp
+@@ -0,0 +1,147 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2016-2018 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#include "arm_compute/core/CL/kernels/CLGatherKernel.h"
++
++#include "arm_compute/core/CL/CLHelpers.h"
++#include "arm_compute/core/CL/CLKernelLibrary.h"
++#include "arm_compute/core/CL/ICLTensor.h"
++#include "arm_compute/core/CL/OpenCL.h"
++#include "arm_compute/core/Error.h"
++#include "arm_compute/core/Helpers.h"
++#include "arm_compute/core/TensorInfo.h"
++#include "arm_compute/core/Validate.h"
++#include "arm_compute/core/Window.h"
++
++#include <cmath>
++#include <cstdlib>
++#include <set>
++#include <string>
++
++using namespace arm_compute;
++
++namespace
++{
++constexpr unsigned int num_elems_processed_per_iteration = 16;
++
++Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
++{
++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S32, DataType::F32);
++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S32);
++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S32, DataType::F32);
++
++ return Status{};
++}
++
++} // namespace
++
++CLGatherKernel::CLGatherKernel()
++ : _input1(nullptr), _input2(nullptr), _output(nullptr)
++{
++}
++
++void CLGatherKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
++{
++ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
++ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S32);
++ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
++
++ _input1 = input1;
++ _input2 = input2;
++ _output = output;
++
++ // Construct kernel name
++ std::string kernel_name = "gather";
++    if (input1->info()->num_dimensions() == 1)
++    {
++        kernel_name = "gather_1d";
++    }
++    else if (input1->info()->num_dimensions() == 2)
++    {
++        if(_output->info()->num_dimensions() == 1)
++        {
++            kernel_name = "gather_1d_out";
++        }
++    }
++
++ // Set kernel build options
++ std::set<std::string> build_opts;
++ build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
++ build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
++ build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
++
++ // Create kernel
++ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
++
++ // Configure kernel window
++    // One index is processed per work item (this local constant deliberately
++    // shadows the file-scope value of 16, which this kernel does not use)
++    const unsigned int num_elems_processed_per_iteration = 1;
++ Window win = calculate_max_window(*input2->info(), Steps(num_elems_processed_per_iteration));
++ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
++
++ ICLKernel::configure(win);
++}
++
++Status CLGatherKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
++{
++ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
++ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output));
++
++ return Status{};
++}
++
++void CLGatherKernel::run(const Window &window, cl::CommandQueue &queue)
++{
++ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
++ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
++
++    if (_input1->info()->num_dimensions() == 1)
++ {
++ Window slice = window.first_slice_window_1D();
++
++ unsigned int idx = 0;
++ add_1D_tensor_argument(idx, _input1, slice);
++ add_1D_tensor_argument(idx, _input2, slice);
++ add_1D_tensor_argument(idx, _output, slice);
++ enqueue(queue, *this, slice);
++ }
++    else if (_input1->info()->num_dimensions() == 2)
++    {
++        Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY);
++        Window slice = window.collapse_if_possible(ICLKernel::window(), Window::DimX);
++
++        // Set inputs
++        unsigned int idx = 0;
++        add_2D_tensor_argument(idx, _input1, window_collapsed);
++        add_1D_tensor_argument(idx, _input2, slice);
++        if(_output->info()->num_dimensions() == 1)
++ {
++ add_1D_tensor_argument(idx, _output, slice);
++ }
++ else
++ {
++ add_2D_tensor_argument(idx, _output, window_collapsed);
++ }
++ enqueue(queue, *this, slice);
++ }
++}
+diff --git a/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp b/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp
+new file mode 100644
+index 0000000..26cb3e2
+--- /dev/null
++++ b/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp
+@@ -0,0 +1,284 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2016-2018 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#include "arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h"
++
++#include "arm_compute/core/CL/CLHelpers.h"
++#include "arm_compute/core/CL/CLKernelLibrary.h"
++#include "arm_compute/core/CL/ICLTensor.h"
++#include "arm_compute/core/CL/OpenCL.h"
++#include "arm_compute/core/Error.h"
++#include "arm_compute/core/Helpers.h"
++#include "arm_compute/core/TensorInfo.h"
++#include "arm_compute/core/Validate.h"
++#include "arm_compute/core/Window.h"
++
++#include <cmath>
++#include <cstdlib>
++#include <set>
++#include <string>
++
++using namespace arm_compute;
++
++namespace
++{
++constexpr unsigned int num_elems_processed_per_iteration = 16;
++
++Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
++ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
++{
++ ARM_COMPUTE_UNUSED(overflow_policy);
++ ARM_COMPUTE_UNUSED(rounding_policy);
++
++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative.");
++
++ const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
++
++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
++ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2);
++
++ if(is_data_type_fixed_point(input1->data_type()))
++ {
++ // All data types must be all QS8 or all QS16
++ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale != 1, "Unsupported scaling factor for QS8/QS16. Scale must be 1.");
++ }
++
++ // Validate in case of configured output
++ if(output->total_size() > 0)
++ {
++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
++ "Output can only be U8 if both inputs are U8");
++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
++ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, output);
++ if(is_data_type_fixed_point(input1->data_type()))
++ {
++ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
++ }
++ }
++
++ return Status{};
++}
++
++std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
++{
++ const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
++ const TensorShape &out_shape = broadcast_pair.first;
++ const ValidRegion &valid_region = broadcast_pair.second;
++
++ // Auto initialize output if not initialized
++ {
++ set_shape_if_empty(*output, out_shape);
++
++ if(input1->data_type() == DataType::S16 || input2->data_type() == DataType::S16)
++ {
++ set_format_if_unknown(*output, Format::S16);
++ }
++ else if(input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32)
++ {
++ set_format_if_unknown(*output, Format::F32);
++ }
++ }
++
++ Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
++ Window win_input1 = win.broadcast_if_dimension_le_one(*input1);
++ Window win_input2 = win.broadcast_if_dimension_le_one(*input2);
++
++ AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration);
++ AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration);
++ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
++
++ bool window_changed = update_window_and_padding(win_input1, input1_access)
++ || update_window_and_padding(win_input2, input2_access)
++ || update_window_and_padding(win, output_access);
++
++ output_access.set_valid_region(win, valid_region);
++
++ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
++ return std::make_pair(err, win);
++}
++} // namespace
++
++CLPixelWiseDivisionKernel::CLPixelWiseDivisionKernel()
++ : _input1(nullptr), _input2(nullptr), _output(nullptr)
++{
++}
++
++void CLPixelWiseDivisionKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale,
++ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
++{
++ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
++ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(),
++ scale, overflow_policy, rounding_policy));
++
++ // Configure kernel window
++ auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
++ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
++
++ _input1 = input1;
++ _input2 = input2;
++ _output = output;
++
++ int scale_int = -1;
++ // Extract sign, exponent and mantissa
++ int exponent = 0;
++ float normalized_mantissa = std::frexp(scale, &exponent);
++ // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
++    // frexp returns a mantissa of 0.5, which means the exponent will be in the range -14 <= e <= 1
++    // Moreover, for n >= 2 the exponent is negative, as we deal with 1/2^n
++ if((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1))
++ {
++ // Store the positive exponent. We know that we compute 1/2^n
++        // Additionally we need to subtract 1 to compensate for frexp using a mantissa of 0.5
++ scale_int = std::abs(exponent - 1);
++ }
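++    // Worked example: scale = 1/8 gives a frexp mantissa of 0.5 and exponent
++    // -2, so scale_int = abs(-2 - 1) = 3, encoding the factor 1/2^3 for the
++    // _int kernel variant.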
++
++ std::string data_type;
++ std::string compute_type;
++ // Check if it has float inputs and output
++ if(is_data_type_float(input1->info()->data_type()) || is_data_type_float(input2->info()->data_type()))
++ {
++ scale_int = -1;
++ compute_type = (input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32) ? "float" : "half";
++ data_type = "DATA_TYPE_FLOAT";
++ }
++ else
++ {
++ if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
++ {
++ compute_type = "int";
++ }
++ else if(input1->info()->data_type() == DataType::QS8)
++ {
++ compute_type = "qs8";
++ }
++ else if(input1->info()->data_type() == DataType::QS16)
++ {
++ compute_type = "qs16";
++ }
++ else
++ {
++ compute_type = "ushort";
++ }
++ data_type = "DATA_TYPE_INT";
++ }
++
++ // Construct kernel name
++ std::string kernel_name = "pixelwise_div";
++ kernel_name += (scale_int >= 0) ? "_int" : "_float";
++
++ // Set kernel build options
++ std::set<std::string> build_opts;
++ build_opts.emplace((overflow_policy == ConvertPolicy::WRAP || is_data_type_float(output->info()->data_type())) ? "-DWRAP" : "-DSATURATE");
++ build_opts.emplace((rounding_policy == RoundingPolicy::TO_ZERO) ? "-DROUND=_rtz" : "-DROUND=_rte");
++ if(is_data_type_fixed_point(input1->info()->data_type()))
++ {
++ build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input1->info()->fixed_point_position()));
++ }
++ build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
++ build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
++ build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
++ build_opts.emplace("-DDATA_TYPE_RES=" + compute_type);
++ build_opts.emplace("-D" + data_type);
++
++ // Create kernel
++ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
++
++ // Set scale argument
++    unsigned int idx = 3 * num_arguments_per_3D_tensor(); // Skip the input and output tensor arguments
++
++ if(scale_int >= 0)
++ {
++ _kernel.setArg(idx++, scale_int);
++ }
++ else
++ {
++ _kernel.setArg(idx++, scale);
++ }
++
++ ICLKernel::configure(win_config.second);
++}
++
++Status CLPixelWiseDivisionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
++ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
++{
++ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
++ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, scale, overflow_policy, rounding_policy));
++ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
++
++ return Status{};
++}
++
++void CLPixelWiseDivisionKernel::run(const Window &window, cl::CommandQueue &queue)
++{
++ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
++ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
++
++ const TensorShape &in_shape1 = _input1->info()->tensor_shape();
++ const TensorShape &in_shape2 = _input2->info()->tensor_shape();
++ const TensorShape &out_shape = _output->info()->tensor_shape();
++
++ bool can_collapse = true;
++ if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
++ {
++ can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
++ for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
++ {
++ can_collapse = (in_shape1[d] == in_shape2[d]);
++ }
++ }
++
++ bool has_collapsed = false;
++ Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
++
++ const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
++ const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
++
++ Window slice = collapsed.first_slice_window_3D();
++ Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
++ Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
++
++ do
++ {
++ unsigned int idx = 0;
++ add_3D_tensor_argument(idx, _input1, slice_input1);
++ add_3D_tensor_argument(idx, _input2, slice_input2);
++ add_3D_tensor_argument(idx, _output, slice);
++ enqueue(queue, *this, slice);
++
++ collapsed.slide_window_slice_3D(slice_input1);
++ collapsed.slide_window_slice_3D(slice_input2);
++ }
++ while(collapsed.slide_window_slice_3D(slice));
++}
++
++BorderSize CLPixelWiseDivisionKernel::border_size() const
++{
++ const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
++ const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
++ return BorderSize(0, border, 0, 0);
++}
+diff --git a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
+index f30ba61..8aa77ae 100644
+--- a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
++++ b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
+@@ -32,6 +32,7 @@
+ #include "arm_compute/core/TensorInfo.h"
+ #include "arm_compute/core/Validate.h"
+ #include "arm_compute/core/Window.h"
++#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
+ #include <cmath>
+ #include <cstdlib>
+@@ -50,8 +51,13 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
+ ARM_COMPUTE_UNUSED(overflow_policy);
+ ARM_COMPUTE_UNUSED(rounding_policy);
+
+- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
++ if (is_data_type_quantized_asymmetric(output->data_type()))
++ {
++ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(output, input1);
++ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(output, input2);
++ }
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative.");
+
+ const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+@@ -69,7 +75,7 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
+ // Validate in case of configured output
+ if(output->total_size() > 0)
+ {
+- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
+ "Output can only be U8 if both inputs are U8");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
+@@ -188,7 +194,15 @@ void CLPixelWiseMultiplicationKernel::configure(const ICLTensor *input1, const I
+
+ // Construct kernel name
+ std::string kernel_name = "pixelwise_mul";
+- kernel_name += (scale_int >= 0) ? "_int" : "_float";
++ if (is_data_type_quantized_asymmetric(output->info()->data_type()))
++ {
++ compute_type = "qasymm8";
++ kernel_name += "_qasymm8";
++ }
++ else
++ {
++ kernel_name += (scale_int >= 0) ? "_int" : "_float";
++ }
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+@@ -204,6 +218,21 @@ void CLPixelWiseMultiplicationKernel::configure(const ICLTensor *input1, const I
+ build_opts.emplace("-DDATA_TYPE_RES=" + compute_type);
+ build_opts.emplace("-D" + data_type);
+
++ if (is_data_type_quantized_asymmetric(output->info()->data_type()))
++ {
++ const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input1->info()->quantization_info() : output->info()->quantization_info();
++
++ float multiplier = input1->info()->quantization_info().scale * input2->info()->quantization_info().scale / output_quant_info.scale;
++ int output_multiplier, output_shift;
++ quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
++
++ build_opts.emplace("-DRESULT_MULT_INT=" + support::cpp11::to_string(output_multiplier));
++ build_opts.emplace("-DRESULT_SHIFT=" + support::cpp11::to_string(output_shift));
++ build_opts.emplace("-DIN1_OFFSET=" + support::cpp11::to_string(-(input1->info()->quantization_info().offset)));
++ build_opts.emplace("-DIN2_OFFSET=" + support::cpp11::to_string(-(input2->info()->quantization_info().offset)));
++ build_opts.emplace("-DRESULT_OFFSET=" + support::cpp11::to_string(output->info()->quantization_info().offset));
++        // TODO: Apply min/max bounds to support fusing with ReLU.
++ }
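++    // Sketch of the expected kernel arithmetic (TFLite-style;
++    // rescale(v, m, s) ~ rounded (v * m) >> (31 + s)):
++    //   out = rescale((in1 + IN1_OFFSET) * (in2 + IN2_OFFSET),
++    //                 RESULT_MULT_INT, RESULT_SHIFT) + RESULT_OFFSET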
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+diff --git a/src/core/CL/kernels/CLReduceMaxKernel.cpp b/src/core/CL/kernels/CLReduceMaxKernel.cpp
+new file mode 100644
+index 0000000..cb1ee03
+--- /dev/null
++++ b/src/core/CL/kernels/CLReduceMaxKernel.cpp
+@@ -0,0 +1,135 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2016-2018 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#include "arm_compute/core/CL/kernels/CLReduceMaxKernel.h"
++
++#include "arm_compute/core/CL/CLHelpers.h"
++#include "arm_compute/core/CL/CLKernelLibrary.h"
++#include "arm_compute/core/CL/ICLTensor.h"
++#include "arm_compute/core/CL/OpenCL.h"
++#include "arm_compute/core/Error.h"
++#include "arm_compute/core/Helpers.h"
++#include "arm_compute/core/TensorInfo.h"
++#include "arm_compute/core/Validate.h"
++#include "arm_compute/core/Window.h"
++
++#include <cmath>
++#include <cstdlib>
++#include <set>
++#include <string>
++
++using namespace arm_compute;
++
++namespace
++{
++constexpr unsigned int num_elems_processed_per_iteration = 16;
++
++Status validate_arguments(const ITensorInfo *input, int32_t axis, const ITensorInfo *output)
++{
++ // Only the simple case is handled:
++ // - Input rank: 2
++ // - Output rank: 1
++ // - Axis: a single axis value, restricted to 1
++
++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis != 1, "Only axis 1 is supported");
++
++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0, "Output tensor shape must not be empty");
++
++ // Validate in case of configured output
++ if(output->total_size() > 0)
++ {
++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != input->data_type(),
++ "Input and output must have the same data type");
++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().num_dimensions() != 1, "Output must be 1-dimensional");
++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->tensor_shape().num_dimensions() != 2, "Input must be 2-dimensional");
++ }
++
++ return Status{};
++}
++
++} // namespace
++
++CLReduceMaxKernel::CLReduceMaxKernel()
++ : _input(nullptr), _output(nullptr), _axis(0)
++{
++}
++
++void CLReduceMaxKernel::configure(const ICLTensor *input, int32_t axis, ICLTensor *output)
++{
++ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
++ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, output->info()));
++
++ _input = input;
++ _output = output;
++ _axis = axis;
++
++ // Configure kernel window
++ int cols = _input->info()->tensor_shape()[0];
++ int rows = _input->info()->tensor_shape()[1];
++ Window win;
++ win.set(0, Window::Dimension(0, cols, 1));
++ win.set(1, Window::Dimension(0, rows, 1));
++
++ // Construct kernel name
++ std::string kernel_name = "reduce_max";
++
++ // Set kernel build options
++ std::set<std::string> build_opts;
++ build_opts.emplace("-DWIDTH=" + support::cpp11::to_string(cols));
++
++ // Create kernel
++ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
++
++ ICLKernel::configure(win);
++}
++
++Status CLReduceMaxKernel::validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output)
++{
++ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
++ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, output));
++
++ return Status{};
++}
++
++void CLReduceMaxKernel::run(const Window &window, cl::CommandQueue &queue)
++{
++ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
++ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
++
++ Window window_input = window;
++ Window slice_input = window_input.first_slice_window_1D();
++
++ do
++ {
++ Window slice_output = slice_input.shift_dimensions(1);
++ unsigned int idx = 0;
++ add_1D_tensor_argument(idx, _input, slice_input);
++ add_1D_tensor_argument(idx, _output, slice_output);
++ enqueue(queue, *this, slice_input);
++
++ }
++ while(window_input.slide_window_slice_1D(slice_input));
++}
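++
++// Usage sketch (hypothetical, not part of the original patch): reduce each row
++// of a 2D tensor to its maximum, i.e. one output value per row:
++//   CLTensor in, out; // `width` and `height` are assumed to be defined
++//   in.allocator()->init(TensorInfo(TensorShape(width, height), 1, DataType::F32));
++//   out.allocator()->init(TensorInfo(TensorShape(height), 1, DataType::F32));
++//   CLReduceMaxKernel kernel;
++//   kernel.configure(&in, 1 /* axis */, &out);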
+diff --git a/src/core/CL/kernels/CLReductionMeanKernel.cpp b/src/core/CL/kernels/CLReductionMeanKernel.cpp
+new file mode 100644
+index 0000000..8e4dc38
+--- /dev/null
++++ b/src/core/CL/kernels/CLReductionMeanKernel.cpp
+@@ -0,0 +1,190 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2017-2018 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#include "arm_compute/core/CL/kernels/CLReductionMeanKernel.h"
++
++#include "arm_compute/core/AccessWindowStatic.h"
++#include "arm_compute/core/CL/CLHelpers.h"
++#include "arm_compute/core/CL/CLKernelLibrary.h"
++#include "arm_compute/core/CL/ICLTensor.h"
++#include "arm_compute/core/FixedPoint.h"
++#include "arm_compute/core/Helpers.h"
++#include "arm_compute/core/TensorInfo.h"
++#include "arm_compute/core/Utils.h"
++#include "arm_compute/core/Validate.h"
++#include "arm_compute/core/Window.h"
++
++#include "support/ToolchainSupport.h"
++
++using namespace arm_compute;
++
++namespace
++{
++Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, std::vector<uint32_t> axis)
++{
++ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
++ ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW);
++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis.size() >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
++
++ bool axis_w = false;
++ bool axis_h = false;
++ for(const uint32_t a : axis)
++ {
++ if(a == 0)
++ {
++ axis_w = true;
++ }
++ else if(a == 1)
++ {
++ axis_h = true;
++ }
++ else
++ {
++ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported axis!");
++ }
++ }
++ // TODO: Support other axes (currently, only reduction over both width and height is supported.)
++ if(!axis_w || !axis_h)
++ {
++ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported axis!");
++ }
++
++ if(output->total_size() != 0)
++ {
++ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
++ ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != DataLayout::NCHW);
++ }
++
++ return Status{};
++}
++
++std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, std::vector<uint32_t> axis)
++{
++ // Output tensor auto initialization if not yet initialized
++ TensorShape output_shape{ input->tensor_shape() };
++ output_shape.set(0, 1);
++ output_shape.set(1, 1);
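++ // Reducing over both width and height collapses the first two dimensions to 1,
++ // e.g. a (W, H, C) input is auto-initialised to a (1, 1, C) output.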
++ auto_init_if_empty(*output, output_shape, output->num_channels(), input->data_type(), input->fixed_point_position());
++
++ // Configure kernel window
++ constexpr unsigned int num_elems_processed_per_iteration_x = 8; //step
++ const unsigned int num_elems_processed_per_iteration_y = input->dimension(1);
++
++ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
++ AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
++ AccessWindowHorizontal output_access(output, 0, 1);
++ bool window_changed = update_window_and_padding(win, input_access,output_access);
++ output_access.set_valid_region(win, output->valid_region());
++
++ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
++
++ return std::make_tuple(err, win);
++}
++} // namespace
++
++CLReductionMeanKernel::CLReductionMeanKernel()
++ : _input(nullptr), _output(nullptr), _reduction_axis(), _border_size()
++{
++}
++
++BorderSize CLReductionMeanKernel::border_size() const
++{
++ return _border_size;
++}
++
++void CLReductionMeanKernel::configure(const ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis)
++{
++ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
++
++ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis));
++
++ _input = input;
++ _output = output;
++ _reduction_axis = axis;
++
++ constexpr unsigned int num_elems_processed_per_iteration_x = 8; //step
++
++ // Set border size
++ _border_size = BorderSize(ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration_x) - input->info()->dimension(0));
++
++ // Set build options
++ std::set<std::string> build_opts;
++ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
++ // build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
++ if(is_data_type_fixed_point(input->info()->data_type()))
++ {
++ build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
++ }
++
++ // Create kernel
++ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reduction_mean", build_opts));
++
++ // Configure kernel window
++ auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis);
++
++ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
++
++ ICLKernel::configure(std::get<1>(win_config));
++}
++
++Status CLReductionMeanKernel::validate(const ITensorInfo *input, const ITensorInfo *output, std::vector<uint32_t> axis)
++{
++ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis));
++ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), axis)));
++
++ return Status{};
++}
++
++void CLReductionMeanKernel::run(const Window &window, cl::CommandQueue &queue)
++{
++ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
++ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
++
++ // Set out window
++ Window out_window(window);
++ out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
++
++ // Get first input and output slices
++ Window in_slice = window.first_slice_window_2D();
++ Window out_slice = out_window.first_slice_window_2D();
++
++ // Set local sums buffer
++ // TODO work_group
++ unsigned int local_sum_size = _lws_hint[0] * _input->info()->element_size();
++
++ unsigned int idx = 2 * num_arguments_per_2D_tensor(); // Skip the input and output parameters
++ _kernel.setArg(idx++, local_sum_size, nullptr);
++ _kernel.setArg<cl_int>(idx++, static_cast<cl_int>(_input->info()->dimension(1))); // height
++ _kernel.setArg<cl_int>(idx++, static_cast<cl_int>(_input->info()->dimension(0) * _input->info()->dimension(1))); // divider
++
++ do
++ {
++ unsigned int idx = 0;
++ add_2D_tensor_argument(idx, _input, in_slice);
++ in_slice.set_dimension_step(Window::DimY, _input->info()->dimension(1));
++ add_2D_tensor_argument(idx, _output, out_slice);
++ enqueue(queue, *this, in_slice);
++ }
++ while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
++}
+diff --git a/src/core/CL/kernels/CLStridedSliceKernel.cpp b/src/core/CL/kernels/CLStridedSliceKernel.cpp
+new file mode 100644
+index 0000000..b57cf20
+--- /dev/null
++++ b/src/core/CL/kernels/CLStridedSliceKernel.cpp
+@@ -0,0 +1,316 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2017 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h"
++
++#include "arm_compute/core/AccessWindowStatic.h"
++#include "arm_compute/core/CL/CLHelpers.h"
++#include "arm_compute/core/CL/CLKernelLibrary.h"
++#include "arm_compute/core/CL/ICLTensor.h"
++#include "arm_compute/core/CL/OpenCL.h"
++#include "arm_compute/core/Helpers.h"
++#include "arm_compute/core/IAccessWindow.h"
++#include "arm_compute/core/TensorInfo.h"
++#include "arm_compute/core/Utils.h"
++#include "arm_compute/core/Validate.h"
++#include "arm_compute/core/Window.h"
++
++#include <string>
++
++
++using namespace arm_compute;
++
++static const int32_t maxDim = 4;
++
++CLStridedSliceKernel::CLStridedSliceKernel()
++ : _input(nullptr), _output(nullptr), _beginData(nullptr), _endData(nullptr), _stridesData(nullptr), _beginMask(0), _endMask(0), _shrinkAxisMask(0)
++{
++}
++
++Status CLStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *begin, const ITensorInfo *end, const ITensorInfo *strides, int32_t beginMask, int32_t endMask, int32_t shrinkAxisMask)
++{
++ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, begin, end, strides);
++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8,
++ DataType::U16, DataType::S16, DataType::QS16,
++ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(begin, 1, DataType::S32);
++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(end, 1, DataType::S32);
++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(strides, 1, DataType::S32);
++ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
++
++ ARM_COMPUTE_RETURN_ERROR_ON(begin->num_dimensions() != 1 || begin->dimension(0) > static_cast<size_t>(maxDim));
++ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(begin->tensor_shape(), end->tensor_shape(), strides->tensor_shape());
++
++ return Status{};
++}
++
++// Return the index for the first element along that axis. This index will be a
++// positive integer between [0, axisSize - 1] that can be used to index
++// directly into the data.
++inline int32_t StartForAxis(int32_t beginMask, int32_t begin, int32_t stride, const TensorShape &inputShape, int32_t axis)
++{
++ // Begin with the specified index
++ int32_t start = begin;
++
++ // beginMask override
++ if (beginMask & 1 << axis)
++ {
++ if (stride > 0)
++ {
++ // Forward iteration - use the first element. These values will get
++ // clamped below (Note: We could have set them to 0 and axisSize-1, but
++ // use lowest() and max() to maintain symmetry with StopForAxis())
++ start = std::numeric_limits<int32_t>::lowest();
++ }
++ else
++ {
++ // Backward iteration - use the last element.
++ start = std::numeric_limits<int32_t>::max();
++ }
++ }
++
++ // Handle negative indices
++ int32_t axisSize = inputShape[axis];
++ if (start < 0)
++ {
++ start += axisSize;
++ }
++
++ // Clamping
++ start = arm_compute::utility::clamp(start, 0, axisSize - 1);
++
++ return start;
++}
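++// Example (illustrative): for axisSize = 4, begin = -1 and stride = 1 with the
++// beginMask bit clear, StartForAxis returns -1 + 4 = 3; with the bit set, the
++// lowest() sentinel is clamped to 0, i.e. iteration starts at the first element.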
++
++// Return the "real" index for the end of iteration along that axis. This is an
++// "end" in the traditional C sense, in that it points to one past the last
++// element, i.e. if you were iterating through all elements of a 1D array of
++// size 4, this function would return 4 as the stop, because it is one past the
++// "real" indices of 0, 1, 2 & 3.
++inline int32_t StopForAxis(int32_t endMask, int32_t end, int32_t stride, const TensorShape &inputShape, int32_t axis)
++{
++ // Begin with the specified index
++ int32_t stop = end;
++
++ // endMask override
++ if (endMask & (1 << axis))
++ {
++ if (stride > 0)
++ {
++ // Forward iteration - use the last element. These values will get
++ // clamped below
++ stop = std::numeric_limits<int32_t>::max();
++ }
++ else
++ {
++ // Backward iteration - use the first element.
++ stop = std::numeric_limits<int32_t>::lowest();
++ }
++ }
++
++ // Handle negative indices
++ int32_t axisSize = inputShape[axis];
++ if (stop < 0) {
++ stop += axisSize;
++ }
++
++ // Clamping
++ // Because the end index points one past the last element, we need slightly
++ // different clamping ranges depending on the direction.
++ if (stride > 0)
++ {
++ // Forward iteration
++ stop = arm_compute::utility::clamp(stop, 0, axisSize);
++ }
++ else
++ {
++ // Backward iteration
++ stop = arm_compute::utility::clamp(stop, -1, axisSize - 1);
++ }
++
++ return stop;
++}
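++// Example (illustrative): for axisSize = 4, end = -1 and stride = 1 with the
++// endMask bit clear, StopForAxis returns -1 + 4 = 3 (one past index 2); with
++// the bit set, the max() sentinel is clamped to 4, one past the last element.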
++
++inline int32_t offset4D(const TensorShape &shape, int32_t b, int32_t d, int32_t h, int32_t w)
++{
++ int32_t offset = b * shape[2] * shape[1] * shape[0];
++ offset += d * shape[1] * shape[0];
++ offset += h * shape[0];
++ offset += w;
++ return offset;
++}
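++// e.g. for a shape with W = 4, H = 3, D = 2: offset4D(shape, 0, 1, 2, 3)
++// = 1 * 3 * 4 + 2 * 4 + 3 = 23, the linear index into the contiguous buffer.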
++
++inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride)
++{
++ int32_t ret = 0;
++ if (stride > 0)
++ {
++ ret = ((stop - start - 1) / stride) + 1;
++ }
++ else
++ {
++ ret = ((stop - start + 1) / stride) + 1;
++ }
++ ARM_COMPUTE_ERROR_ON_MSG(ret < 0, "The output dimension must be non-negative");
++ return ret;
++}
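++// e.g. getOutDim(0, 4, 2) = ((4 - 0 - 1) / 2) + 1 = 2 (elements 0 and 2), and
++// getOutDim(3, -1, -1) = ((-1 - 3 + 1) / -1) + 1 = 4 (elements 3, 2, 1, 0).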
++
++void CLStridedSliceKernel::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData, ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask, int32_t shrinkAxisMask)
++{
++ ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), beginData->info(), endData->info(), stridesData->info(), beginMask, endMask, shrinkAxisMask));
++
++ _input = input;
++ _output = output;
++ _beginData = beginData;
++ _endData = endData;
++ _stridesData = stridesData;
++ _beginMask = beginMask;
++ _endMask = endMask;
++ _shrinkAxisMask = shrinkAxisMask;
++
++ constexpr unsigned int num_elems_processed_per_iteration = 1;
++
++ // Set kernel build options
++ std::set<std::string> build_opts;
++ build_opts.emplace("-DELEMENT_DATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
++ build_opts.emplace("-DELEMENT_SIZE=" + support::cpp11::to_string(input->info()->element_size()));
++
++ // Create kernel
++ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("strided_slice", build_opts));
++
++ // Create output's window without padding
++ TensorShape collapsed = output->info()->tensor_shape();
++ collapsed.collapse(4);
++ TensorInfo info = *output->info();
++ info.set_tensor_shape(collapsed);
++ Window win = calculate_max_window(info, Steps(num_elems_processed_per_iteration));
++
++ ICLKernel::configure(win);
++}
++
++void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue)
++{
++ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
++ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
++
++ // Create input window
++ TensorShape collapsed = _input->info()->tensor_shape();
++ collapsed.collapse(4);
++ TensorInfo info = *_input->info();
++ info.set_tensor_shape(collapsed);
++ Window win_in = calculate_max_window(info, Steps(_input->info()->tensor_shape().total_size()));
++
++ _beginData->map(queue);
++ _endData->map(queue);
++ _stridesData->map(queue);
++
++ std::vector<int32_t> dimsIn;
++ std::vector<int32_t> dimsOut;
++ std::vector<int32_t> starts;
++ std::vector<int32_t> stops;
++ std::vector<int32_t> strides;
++
++ for (uint32_t n = 0; n < _beginData->info()->tensor_shape().total_size(); ++n)
++ {
++ const TensorShape shape = _input->info()->tensor_shape();
++ starts.emplace_back(StartForAxis(_beginMask, reinterpret_cast<int32_t *>(_beginData->buffer())[n],
++ reinterpret_cast<int32_t *>(_stridesData->buffer())[n],
++ shape, n));
++
++ stops.emplace_back(StopForAxis(_endMask, reinterpret_cast<int32_t *>(_endData->buffer())[n],
++ reinterpret_cast<int32_t *>(_stridesData->buffer())[n],
++ shape, n));
++
++ strides.emplace_back(reinterpret_cast<int32_t *>(_stridesData->buffer())[n]);
++ dimsIn.emplace_back(shape[n]);
++ dimsOut.emplace_back(getOutDim(starts[n], stops[n], strides[n]));
++ }
++
++ for (uint32_t n = _beginData->info()->tensor_shape().total_size(); n < static_cast<uint32_t>(maxDim); ++n)
++ {
++ starts.emplace_back(0);
++ stops.emplace_back(1);
++ strides.emplace_back(1);
++ dimsIn.emplace_back(1);
++ dimsOut.emplace_back(1);
++ }
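++ // The remaining dimensions up to 4 were padded above with identity slices
++ // (start 0, stop 1, stride 1) so the kernel always receives full cl_int4 arguments.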
++ // TODO: Apply shrinkAxisMask
++
++ _beginData->unmap(queue);
++ _stridesData->unmap(queue);
++ _endData->unmap(queue);
++
++ // Set parameters
++ unsigned int idx = 2 * num_arguments_per_1D_tensor(); // Skip the input and output parameters
++ const cl_int4 dimsInArg =
++ {
++ {
++ static_cast<cl_int>(dimsIn[0]),
++ static_cast<cl_int>(dimsIn[1]),
++ static_cast<cl_int>(dimsIn[2]),
++ static_cast<cl_int>(dimsIn[3]),
++ }
++ };
++ _kernel.setArg<cl_int4>(idx++, dimsInArg);
++
++ const cl_int4 dimsOutArg =
++ {
++ {
++ static_cast<cl_int>(dimsOut[0]),
++ static_cast<cl_int>(dimsOut[1]),
++ static_cast<cl_int>(dimsOut[2]),
++ static_cast<cl_int>(dimsOut[3]),
++ }
++ };
++ _kernel.setArg<cl_int4>(idx++, dimsOutArg);
++
++ const cl_int4 startsArg =
++ {
++ {
++ static_cast<cl_int>(starts[0]),
++ static_cast<cl_int>(starts[1]),
++ static_cast<cl_int>(starts[2]),
++ static_cast<cl_int>(starts[3]),
++ }
++ };
++ _kernel.setArg<cl_int4>(idx++, startsArg);
++
++ const cl_int4 stridesArg =
++ {
++ {
++ static_cast<cl_int>(strides[0]),
++ static_cast<cl_int>(strides[1]),
++ static_cast<cl_int>(strides[2]),
++ static_cast<cl_int>(strides[3]),
++ }
++ };
++ _kernel.setArg<cl_int4>(idx++, stridesArg);
++
++ // TODO: Apply slicing output's window
++ idx = 0;
++ add_1D_tensor_argument(idx, _input, win_in);
++ add_1D_tensor_argument(idx, _output, window);
++
++ enqueue(queue, *this, window);
++}
+diff --git a/src/core/CL/kernels/CLTopKV2Kernel.cpp b/src/core/CL/kernels/CLTopKV2Kernel.cpp
+new file mode 100644
+index 0000000..08cc6bc
+--- /dev/null
++++ b/src/core/CL/kernels/CLTopKV2Kernel.cpp
+@@ -0,0 +1,479 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2017 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h"
++
++#include "arm_compute/core/CL/CLHelpers.h"
++#include "arm_compute/core/CL/CLKernelLibrary.h"
++#include "arm_compute/core/CL/ICLTensor.h"
++#include "arm_compute/core/Helpers.h"
++#include "arm_compute/core/Validate.h"
++#include "arm_compute/core/Window.h"
++
++#include <climits>
++#include <cassert>
++
++namespace arm_compute
++{
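++// The kernels below are presumably chained by a CLTopKV2 function (an
++// assumption based on their names): topkv2_init packs the input into key/index
++// buffers, the radixsort_* kernels sort the keys over several digit-wise
++// passes, the find-first-negative/reorder-negatives kernels repair the order
++// of negative floating-point keys, and topkv2_store writes the top k results.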
++////////////////////////////////////////////////////////////////////////////////
++CLTopKV2Single::CLTopKV2Single()
++ : _input(nullptr), _topk_values(nullptr), _topk_indices(nullptr)
++{}
++
++void CLTopKV2Single::configure(ICLTensor *input, ICLTensor *topk_values,
++ ICLTensor *topk_indices, cl::Buffer *indices,
++ cl::Buffer *temp_stack, int k, int n)
++{
++ ARM_COMPUTE_ERROR_ON(input == nullptr || indices == nullptr);
++ ARM_COMPUTE_ERROR_ON(topk_values == nullptr || topk_indices == nullptr);
++ ARM_COMPUTE_ERROR_ON(n == 0);
++
++ _input = input;
++ _topk_values = topk_values;
++ _topk_indices = topk_indices;
++
++ // Set kernel build options
++ std::set<std::string> build_opts;
++
++ // Create kernel
++ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("topkv2_quicksort", build_opts));
++
++ unsigned int idx = 3*num_arguments_per_1D_tensor();
++ _kernel.setArg(idx++, *indices);
++ _kernel.setArg(idx++, *temp_stack);
++ _kernel.setArg<cl_int>(idx++, k);
++ _kernel.setArg<cl_int>(idx++, n);
++
++ // Configure kernel window
++ Window win;
++ win.set(0, Window::Dimension(0, 1, 1));
++ ICLKernel::configure(win);
++}
++
++void CLTopKV2Single::run(const Window &window, cl::CommandQueue &queue)
++{
++ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
++ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
++
++ unsigned int idx = 0;
++ add_1D_tensor_argument(idx, _input, window);
++ add_1D_tensor_argument(idx, _topk_values, window);
++ add_1D_tensor_argument(idx, _topk_indices, window);
++
++ enqueue(queue, *this, window);
++}
++
++////////////////////////////////////////////////////////////////////////////////
++CLTopKV2Init::CLTopKV2Init()
++ : _input(nullptr)
++{}
++
++void CLTopKV2Init::configure(ICLTensor *input, cl::Buffer* in_key_buf,
++ cl::Buffer* in_ind_buf, int n)
++{
++ ARM_COMPUTE_ERROR_ON(input == nullptr || in_key_buf == nullptr);
++ ARM_COMPUTE_ERROR_ON(in_ind_buf == nullptr);
++ ARM_COMPUTE_ERROR_ON(n == 0);
++
++ _input = input;
++
++ // Set kernel build options
++ std::set<std::string> build_opts;
++
++ // Create kernel
++ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("topkv2_init", build_opts));
++
++ unsigned int idx = num_arguments_per_1D_tensor();
++ _kernel.setArg(idx++, *in_key_buf);
++ _kernel.setArg(idx++, *in_ind_buf);
++ _kernel.setArg<cl_int>(idx++, n);
++
++ // Configure kernel window
++ Window win;
++ win.set(0, Window::Dimension(0, n, 1));
++ ICLKernel::configure(win);
++}
++
++void CLTopKV2Init::run(const Window &window, cl::CommandQueue &queue)
++{
++ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
++ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
++
++ unsigned int idx = 0;
++ add_1D_tensor_argument(idx, _input, window);
++
++ enqueue(queue, *this, window);
++}
++
++////////////////////////////////////////////////////////////////////////////////
++// This kernel makes a histogram of radix for each work item.
++CLRadixSortHistogram::CLRadixSortHistogram()
++: _pass(0), _in_key_buf(nullptr)
++{}
++
++void CLRadixSortHistogram::configure(cl::Buffer* hist_buf, int bits, int n)
++{
++ ARM_COMPUTE_ERROR_ON(hist_buf == nullptr);
++
++ unsigned int radix = 1 << bits;
++ // Set kernel build options
++ std::set<std::string> build_opts;
++ build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
++ build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
++ build_opts.emplace("-DPERMUT=1");
++
++ // Create kernel
++ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("radixsort_histogram", build_opts));
++
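++ // Each of the _ITEMS work-items in a group keeps its own bank of `radix`
++ // counters in local memory, hence radix * _ITEMS histogram slots per group.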
++ int loc_histo_size = radix * _ITEMS * sizeof(cl_int);
++
++ unsigned int idx = 1;
++ _kernel.setArg(idx++, *hist_buf);
++
++ idx = 3;
++ _kernel.setArg(idx++, loc_histo_size, nullptr);
++ _kernel.setArg<cl_int>(idx++, n);
++
++ // Configure kernel window
++ Window win;
++ win.set(0, Window::Dimension(0, _GROUPS*_ITEMS, 1));
++ ICLKernel::configure(win);
++}
++
++void CLRadixSortHistogram::run(const Window &window, cl::CommandQueue &queue)
++{
++ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
++ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
++
++ _kernel.setArg(0, *_in_key_buf);
++ _kernel.setArg<cl_int>(2, _pass);
++
++ cl::NDRange lws = cl::NDRange(_ITEMS, 1);
++
++ enqueue(queue, *this, window, lws);
++}
++
++////////////////////////////////////////////////////////////////////////////////
++CLRadixSortScanHistogram::CLRadixSortScanHistogram()
++{}
++
++void CLRadixSortScanHistogram::configure(cl::Buffer* hist_buf, cl::Buffer* glob_sum_buf, int bits)
++{
++ ARM_COMPUTE_ERROR_ON(hist_buf == nullptr || glob_sum_buf == nullptr);
++
++ unsigned int radix = 1 << bits;
++ // Set kernel build options
++ std::set<std::string> build_opts;
++ build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
++ build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
++ build_opts.emplace("-DPERMUT=1");
++
++ // Create kernel
++ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("radixsort_scanhistograms", build_opts));
++
++ int temp_size = std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint);
++
++ unsigned int idx = 0;
++ _kernel.setArg(idx++, *hist_buf);
++ _kernel.setArg(idx++, temp_size, nullptr);
++ _kernel.setArg(idx++, *glob_sum_buf);
++
++ // Configure kernel window
++ Window win;
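++ // Half as many work-items as histogram entries are launched, presumably
++ // because each item scans two entries per step (a two-elements-per-thread scan).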
++ win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS/2, 1));
++ ICLKernel::configure(win);
++}
++
++void CLRadixSortScanHistogram::run(const Window &window, cl::CommandQueue &queue)
++{
++ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
++ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
++
++ const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
++ cl::NDRange lws = cl::NDRange(gws_x/_HISTOSPLIT, 1);
++
++ enqueue(queue, *this, window, lws);
++}
++
++////////////////////////////////////////////////////////////////////////////////
++CLRadixSortGlobalScanHistogram::CLRadixSortGlobalScanHistogram()
++{}
++
++void CLRadixSortGlobalScanHistogram::configure(cl::Buffer* glob_sum_buf, cl::Buffer* temp_buf, int bits)
++{
++ ARM_COMPUTE_ERROR_ON(glob_sum_buf == nullptr || temp_buf == nullptr);
++
++ unsigned int radix = 1 << bits;
++ // Set kernel build options
++ std::set<std::string> build_opts;
++ build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
++ build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
++ build_opts.emplace("-DPERMUT=1");
++
++ // Create kernel
++ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("radixsort_scanhistograms", build_opts));
++
++ int temp_size = std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint);
++
++ unsigned int idx = 0;
++ _kernel.setArg(idx++, *glob_sum_buf);
++ _kernel.setArg(idx++, temp_size, nullptr);
++ _kernel.setArg(idx++, *temp_buf);
++
++ // Configure kernel window
++ Window win;
++ win.set(0, Window::Dimension(0, _HISTOSPLIT/2, 1));
++ ICLKernel::configure(win);
++}
++
++void CLRadixSortGlobalScanHistogram::run(const Window &window, cl::CommandQueue &queue)
++{
++ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
++ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
++
++ const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
++ cl::NDRange lws = cl::NDRange(gws_x, 1);
++
++ enqueue(queue, *this, window, lws);
++}
++
++////////////////////////////////////////////////////////////////////////////////
++CLRadixSortPasteHistogram::CLRadixSortPasteHistogram()
++{}
++
++void CLRadixSortPasteHistogram::configure(cl::Buffer* hist_buf, cl::Buffer* glob_sum_buf, int bits)
++{
++ ARM_COMPUTE_ERROR_ON(hist_buf == nullptr || glob_sum_buf == nullptr);
++
++ unsigned int radix = 1 << bits;
++ // Set kernel build options
++ std::set<std::string> build_opts;
++ build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
++ build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
++ build_opts.emplace("-DPERMUT=1");
++
++ // Create kernel
++ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("radixsort_pastehistograms", build_opts));
++
++ unsigned int idx = 0;
++ _kernel.setArg(idx++, *hist_buf);
++ _kernel.setArg(idx++, *glob_sum_buf);
++
++ // Configure kernel window
++ Window win;
++ win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1));
++ ICLKernel::configure(win);
++}
++
++void CLRadixSortPasteHistogram::run(const Window &window, cl::CommandQueue &queue)
++{
++ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
++ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
++
++ const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
++ cl::NDRange lws = cl::NDRange(gws_x/_HISTOSPLIT, 1);
++
++ enqueue(queue, *this, window, lws);
++}
++
++////////////////////////////////////////////////////////////////////////////////
++CLRadixSortReorder::CLRadixSortReorder()
++: _pass(0), _in_key_buf(nullptr), _out_key_buf(nullptr),
++ _in_ind_buf(nullptr), _out_ind_buf(nullptr)
++{}
++
++void CLRadixSortReorder::configure(cl::Buffer *hist_buf, int bits, int n)
++{
++ ARM_COMPUTE_ERROR_ON(hist_buf == nullptr);
++ ARM_COMPUTE_ERROR_ON(n == 0);
++
++ unsigned int radix = 1 << bits;
++ // Set kernel build options
++ std::set<std::string> build_opts;
++ build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
++ build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
++ build_opts.emplace("-DPERMUT=1");
++
++ // Create kernel
++ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("radixsort_reorder", build_opts));
++
++ unsigned int idx = 2;
++ _kernel.setArg(idx++, *hist_buf);
++
++ idx = 6;
++ _kernel.setArg(idx++, sizeof(cl_uint) * radix * _ITEMS, nullptr);
++ _kernel.setArg<cl_int>(idx++, n);
++
++ // Configure kernel window
++ Window win;
++ win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1));
++ ICLKernel::configure(win);
++}
++
++void CLRadixSortReorder::run(const Window &window, cl::CommandQueue &queue)
++{
++ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
++ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
++
++ const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
++ unsigned int lx = std::max(1U, (gws_x / _HISTOSPLIT));
++ cl::NDRange lws = (lx < gws_x) ? cl::NDRange(lx, 1) : cl::NDRange(1, 1);
++
++ _kernel.setArg(0, *_in_key_buf);
++ _kernel.setArg(1, *_out_key_buf);
++ _kernel.setArg<cl_int>(3, _pass);
++ _kernel.setArg(4, *_in_ind_buf);
++ _kernel.setArg(5, *_out_ind_buf);
++
++ enqueue(queue, *this, window, lws);
++}
++
++////////////////////////////////////////////////////////////////////////////////
++CLTopKV2FindFirstNegative::CLTopKV2FindFirstNegative()
++: _out_key_buf(nullptr)
++{}
++
++void CLTopKV2FindFirstNegative::configure(cl::Buffer *first_negative_idx_buf, int n)
++{
++ ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr);
++ ARM_COMPUTE_ERROR_ON(n == 0);
++
++ // Set kernel build options
++ std::set<std::string> build_opts;
++
++ // Create kernel
++ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("topkv2_find_first_negative", build_opts));
++
++ unsigned int idx = 1;
++ _kernel.setArg(idx++, *first_negative_idx_buf);
++ _kernel.setArg<cl_int>(idx++, n);
++
++ // Configure kernel window
++ Window win;
++ win.set(0, Window::Dimension(0, n, 1));
++ ICLKernel::configure(win);
++}
++
++void CLTopKV2FindFirstNegative::run(const Window &window, cl::CommandQueue &queue)
++{
++ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
++ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
++
++ unsigned int idx = 0;
++ _kernel.setArg(idx++, *_out_key_buf);
++
++ enqueue(queue, *this, window);
++}
++
++////////////////////////////////////////////////////////////////////////////////
++CLTopKV2ReorderNegatives::CLTopKV2ReorderNegatives()
++: _in_key_buf(nullptr), _out_key_buf(nullptr),
++ _in_ind_buf(nullptr), _out_ind_buf(nullptr)
++{}
++
++void CLTopKV2ReorderNegatives::configure(cl::Buffer *first_negative_idx_buf, int n)
++{
++ ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr);
++ ARM_COMPUTE_ERROR_ON(n == 0);
++
++ // Set kernel build options
++ std::set<std::string> build_opts;
++
++ // Create kernel
++ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("topkv2_reorder_negatives", build_opts));
++
++ unsigned int idx = 4;
++ _kernel.setArg(idx++, *first_negative_idx_buf);
++ _kernel.setArg<cl_int>(idx++, n);
++
++ // Configure kernel window
++ Window win;
++ win.set(0, Window::Dimension(0, n, 1));
++ ICLKernel::configure(win);
++}
++
++void CLTopKV2ReorderNegatives::run(const Window &window, cl::CommandQueue &queue)
++{
++ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
++ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
++
++ unsigned int idx = 0;
++ _kernel.setArg(idx++, *_in_key_buf);
++ _kernel.setArg(idx++, *_out_key_buf);
++ _kernel.setArg(idx++, *_in_ind_buf);
++ _kernel.setArg(idx++, *_out_ind_buf);
++
++ enqueue(queue, *this, window);
++}
++
++////////////////////////////////////////////////////////////////////////////////
++CLTopKV2Store::CLTopKV2Store()
++: _values(nullptr), _indices(nullptr), _out_key_buf(nullptr), _out_ind_buf(nullptr)
++{}
++
++void CLTopKV2Store::configure(ICLTensor *values, ICLTensor *indices, int k, int n)
++{
++ ARM_COMPUTE_ERROR_ON(values == nullptr || indices == nullptr);
++ ARM_COMPUTE_ERROR_ON(k == 0);
++ ARM_COMPUTE_ERROR_ON(k > n);
++
++ _values = values;
++ _indices = indices;
++
++ // Set kernel build options
++ std::set<std::string> build_opts;
++
++ // Create kernel
++ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("topkv2_store", build_opts));
++
++ unsigned int idx = 2 * num_arguments_per_1D_tensor() + 2;
++ _kernel.setArg<cl_int>(idx++, n);
++
++ // Configure kernel window
++ Window win;
++ win.set(0, Window::Dimension(0, k, 1));
++ ICLKernel::configure(win);
++}
++
++void CLTopKV2Store::setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf)
++{
++ _out_key_buf = out_key_buf;
++ _out_ind_buf = out_ind_buf;
++}
++
++void CLTopKV2Store::run(const Window &window, cl::CommandQueue &queue)
++{
++ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
++ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
++
++ unsigned int idx = 0;
++ add_1D_tensor_argument(idx, _values, window);
++ add_1D_tensor_argument(idx, _indices, window);
++ _kernel.setArg(idx++, *_out_key_buf);
++ _kernel.setArg(idx++, *_out_ind_buf);
++
++ enqueue(queue, *this, window);
++}
++
++} // namespace arm_compute
+diff --git a/src/core/Validate.cpp b/src/core/Validate.cpp
+index d4fabd4..d0374e7 100644
+--- a/src/core/Validate.cpp
++++ b/src/core/Validate.cpp
+@@ -169,7 +169,7 @@ arm_compute::Status arm_compute::error_on_invalid_subtensor(const char *function
+ // Subtensor should not index in x, y dimensions.
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC(((coords.x() != 0) || (coords.y() != 0)), function, file, line);
+ // Subtensor shape should match parent tensor in x, y dimensions.
+- ARM_COMPUTE_RETURN_ERROR_ON_LOC(((parent_shape.x() != shape.x()) || (parent_shape.y() != parent_shape.y())), function, file, line);
++ ARM_COMPUTE_RETURN_ERROR_ON_LOC(((parent_shape.x() != shape.x()) || (parent_shape.y() != shape.y())), function, file, line);
+
+ // Check dimensions
+ for(unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i)
+diff --git a/src/runtime/CL/functions/CLArithmeticSubtraction.cpp b/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
+index 5fca30c..3c13e3d 100644
+--- a/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
++++ b/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
+@@ -1,4 +1,5 @@
+ /*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+@@ -23,6 +24,7 @@
+ */
+ #include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h"
+
++#include "arm_compute/core/CL/ICLTensor.h"
+ #include "arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h"
+ #include "support/ToolchainSupport.h"
+
+@@ -30,11 +32,21 @@
+
+ using namespace arm_compute;
+
+-void CLArithmeticSubtraction::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
++void CLArithmeticSubtraction::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
+ {
+ auto k = arm_compute::support::cpp14::make_unique<CLArithmeticSubtractionKernel>();
+ k->configure(input1, input2, output, policy);
+ _kernel = std::move(k);
++
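++ // If one input is broadcast along X (its dimension 0 is 1) while the output is
++ // wider, the kernel may read past that single element; replicating the border
++ // keeps those accesses well defined.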
++ if(output->info()->dimension(0) > 1)
++ {
++ ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
++
++ if(broadcasted_info->info()->dimension(0) == 1)
++ {
++ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
++ }
++ }
+ }
+
+ Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+diff --git a/src/runtime/CL/functions/CLCast.cpp b/src/runtime/CL/functions/CLCast.cpp
+new file mode 100644
+index 0000000..4669577
+--- /dev/null
++++ b/src/runtime/CL/functions/CLCast.cpp
+@@ -0,0 +1,37 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2016-2018 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#include "arm_compute/runtime/CL/functions/CLCast.h"
++
++#include "arm_compute/core/CL/kernels/CLCastKernel.h"
++#include "support/ToolchainSupport.h"
++
++using namespace arm_compute;
++
++void CLCast::configure(ICLTensor *input, ICLTensor *output)
++{
++ auto k = arm_compute::support::cpp14::make_unique<CLCastKernel>();
++ k->configure(input, output);
++ _kernel = std::move(k);
++}
+diff --git a/src/runtime/CL/functions/CLGather.cpp b/src/runtime/CL/functions/CLGather.cpp
+new file mode 100644
+index 0000000..3f2f2c1
+--- /dev/null
++++ b/src/runtime/CL/functions/CLGather.cpp
+@@ -0,0 +1,45 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2016-2018 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#include "arm_compute/runtime/CL/functions/CLGather.h"
++
++#include "arm_compute/core/CL/ICLTensor.h"
++#include "arm_compute/core/CL/kernels/CLGatherKernel.h"
++#include "support/ToolchainSupport.h"
++
++#include <utility>
++
++using namespace arm_compute;
++
++void CLGather::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
++{
++ auto k = arm_compute::support::cpp14::make_unique<CLGatherKernel>();
++ k->configure(input1, input2, output);
++ _kernel = std::move(k);
++}
++
++Status CLGather::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
++{
++ return CLGatherKernel::validate(input1, input2, output);
++}
+diff --git a/src/runtime/CL/functions/CLPixelWiseDivision.cpp b/src/runtime/CL/functions/CLPixelWiseDivision.cpp
+new file mode 100644
+index 0000000..343e944
+--- /dev/null
++++ b/src/runtime/CL/functions/CLPixelWiseDivision.cpp
+@@ -0,0 +1,57 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2016-2018 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#include "arm_compute/runtime/CL/functions/CLPixelWiseDivision.h"
++
++#include "arm_compute/core/CL/ICLTensor.h"
++#include "arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h"
++#include "support/ToolchainSupport.h"
++
++#include <utility>
++
++using namespace arm_compute;
++
++void CLPixelWiseDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale,
++ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
++{
++ auto k = arm_compute::support::cpp14::make_unique<CLPixelWiseDivisionKernel>();
++ k->configure(input1, input2, output, scale, overflow_policy, rounding_policy);
++ _kernel = std::move(k);
++
++ if(output->info()->dimension(0) > 1)
++ {
++ ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
++
++ if(broadcasted_info->info()->dimension(0) == 1)
++ {
++ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
++ }
++ }
++}
++
++Status CLPixelWiseDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
++ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
++{
++ return CLPixelWiseDivisionKernel::validate(input1, input2, output, scale, overflow_policy, rounding_policy);
++}
+diff --git a/src/runtime/CL/functions/CLReduceMax.cpp b/src/runtime/CL/functions/CLReduceMax.cpp
+new file mode 100644
+index 0000000..276ffd2
+--- /dev/null
++++ b/src/runtime/CL/functions/CLReduceMax.cpp
+@@ -0,0 +1,132 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2017 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#include "arm_compute/runtime/CL/functions/CLReduceMax.h"
++
++#include "arm_compute/core/CL/ICLTensor.h"
++#include "support/ToolchainSupport.h"
++#include "arm_compute/core/CL/CLHelpers.h"
++#include "arm_compute/core/CL/kernels/CLReduceMaxKernel.h"
++
++#include <vector>
++#include <algorithm>
++
++#include <utility>
++
++#define REDUCE_MAX_RUN_ON_CPU 1
++
++namespace arm_compute
++{
++
++CLReduceMax::CLReduceMax()
++: _axis(0), _input(nullptr), _output(nullptr), _kernel(nullptr)
++{
++}
++
++void CLReduceMax::configure(ICLTensor *input, int axis, ICLTensor *output)
++{
++ _axis = axis;
++
++ _input = input;
++ _output = output;
++
++ // Only the simple case is handled; check the constraints before configuring:
++ // - Input rank: 2, output rank: 1
++ // - Axis: a single axis value, restricted to 1
++ ARM_COMPUTE_ERROR_ON(input->info()->tensor_shape().num_dimensions() != 2);
++ ARM_COMPUTE_ERROR_ON(output->info()->tensor_shape().num_dimensions() != 1);
++ ARM_COMPUTE_ERROR_ON(axis != 1);
++
++ auto k = arm_compute::support::cpp14::make_unique<CLReduceMaxKernel>();
++ k->configure(input, axis, output);
++ _kernel = std::move(k);
++}
++
++Status CLReduceMax::validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output)
++{
++ return CLReduceMaxKernel::validate(input, axis, output);
++}
++
++void CLReduceMax::run()
++{
++#if REDUCE_MAX_RUN_ON_CPU
++ run_on_cpu();
++
++ arm_compute::CLScheduler::get().sync();
++#else
++ arm_compute::CLScheduler::get().enqueue(*_kernel);
++#endif
++}
++
++void CLReduceMax::run_on_cpu()
++{
++ cl::CommandQueue q = CLScheduler::get().queue();
++
++ _input->map(q);
++ _output->map(q);
++
++ // Compute on the CPU for the simple case only:
++ // - Input rank: 2
++ // - Output rank: 1
++ // - Axis: a single axis value, restricted to 1
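++ // e.g. a W = 3, H = 2 input { {1, 5, 2}, {7, 0, 3} } yields row maxima { 5, 7 }.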
++
++ float *input_data = reinterpret_cast<float *>(_input->buffer());
++ float *output_data = reinterpret_cast<float *>(_output->buffer());
++
++ std::vector<float> container_max;
++ int cols = _input->info()->tensor_shape()[0];
++ int rows = _input->info()->tensor_shape()[1];
++ container_max.resize(rows);
++
++ // Initialize as 1st element in row
++ float* input_pointer = input_data;
++ for (int i = 0; i < rows; i++)
++ {
++ container_max[i] = *input_pointer;
++ input_pointer += cols;
++ }
++
++ // Update max value in row
++ for (int i = 0; i < rows; i++)
++ {
++ float max_in_row = container_max[i];
++ for (int j = 1; j < cols; j++)
++ {
++ if (max_in_row < input_data[i * cols + j])
++ {
++ max_in_row = input_data[i * cols + j];
++ }
++ }
++ container_max[i] = max_in_row;
++ }
++
++ for (int i = 0; i < rows; i++)
++ {
++ output_data[i] = container_max[i];
++ }
++
++ _input->unmap(q);
++ _output->unmap(q);
++}
++} // namespace arm_compute
+diff --git a/src/runtime/CL/functions/CLReductionMean.cpp b/src/runtime/CL/functions/CLReductionMean.cpp
+new file mode 100644
+index 0000000..4f71e84
+--- /dev/null
++++ b/src/runtime/CL/functions/CLReductionMean.cpp
+@@ -0,0 +1,60 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2017-2018 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#include "arm_compute/runtime/CL/functions/CLReductionMean.h"
++
++#include "arm_compute/core/CL/ICLTensor.h"
++#include "arm_compute/core/CL/kernels/CLReductionMeanKernel.h"
++#include "arm_compute/core/Error.h"
++#include "arm_compute/core/PixelValue.h"
++#include "arm_compute/core/TensorInfo.h"
++#include "arm_compute/core/Validate.h"
++#include "arm_compute/runtime/CL/CLScheduler.h"
++#include "arm_compute/runtime/Tensor.h"
++#include "support/ToolchainSupport.h"
++
++using namespace arm_compute;
++
++CLReductionMean::CLReductionMean()
++ : _reduction_mean_kernel(), _fill_border_kernel()
++{
++}
++
++Status CLReductionMean::validate(const ITensorInfo *input, const ITensorInfo *output, std::vector<uint32_t> axis)
++{
++ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionMeanKernel::validate(input, output, axis));
++ return Status{};
++}
++
++void CLReductionMean::configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis)
++{
++ _reduction_mean_kernel.configure(input, output, axis);
++ _fill_border_kernel.configure(input, _reduction_mean_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
++}
++
++void CLReductionMean::run()
++{
++ CLScheduler::get().enqueue(_fill_border_kernel);
++ CLScheduler::get().enqueue(_reduction_mean_kernel);
++}
+diff --git a/src/runtime/CL/functions/CLStridedSlice.cpp b/src/runtime/CL/functions/CLStridedSlice.cpp
+new file mode 100644
+index 0000000..2695fc6
+--- /dev/null
++++ b/src/runtime/CL/functions/CLStridedSlice.cpp
+@@ -0,0 +1,288 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2017 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#include "arm_compute/runtime/CL/functions/CLStridedSlice.h"
++
++#include "arm_compute/core/CL/ICLTensor.h"
++#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h"
++#include "arm_compute/core/utils/misc/Utility.h"
++#include "arm_compute/runtime/CL/CLScheduler.h"
++#include "support/ToolchainSupport.h"
++#include <vector>
++
++using namespace arm_compute;
++
++static const int32_t maxDims = 4;
++
++// Return the index for the first element along that axis. This index will be a
++// positive integer between [0, axisSize - 1] that can be used to index
++// directly into the data.
++inline int32_t StartForAxis(int32_t beginMask,
++ std::vector<int32_t> const &startIndices,
++ std::vector<int32_t> const &strides,
++ const TensorShape &inputShape, int32_t axis)
++{
++ // Begin with the specified index
++ int32_t start = startIndices[axis];
++
++ // beginMask override
++ if (beginMask & (1 << axis))
++ {
++ if (strides[axis] > 0)
++ {
++ // Forward iteration - use the first element. These values will get
++ // clamped below (Note: We could have set them to 0 and axisSize-1, but
++ // use lowest() and max() to maintain symmetry with StopForAxis())
++ start = std::numeric_limits<int32_t>::lowest();
++ }
++ else
++ {
++ // Backward iteration - use the last element.
++ start = std::numeric_limits<int32_t>::max();
++ }
++ }
++
++ // Handle negative indices
++ int32_t axisSize = inputShape[axis];
++ if (start < 0)
++ {
++ start += axisSize;
++ }
++
++ // Clamping
++ start = arm_compute::utility::clamp(start, 0, axisSize - 1);
++
++ return start;
++}
++
++// Return the "real" index for the end of iteration along that axis. This is an
++// "end" in the traditional C sense, in that it points to one past the last
++// element, i.e. if you were iterating through all elements of a 1D array of
++// size 4, this function would return 4 as the stop, because it is one past the
++// "real" indices of 0, 1, 2 & 3.
++inline int32_t StopForAxis(int32_t endMask, std::vector<int32_t> const &stopIndices,
++ std::vector<int32_t> const &strides,
++ const TensorShape &inputShape, int32_t axis)
++{
++ // Begin with the specified index
++ int32_t stop = stopIndices[axis];
++
++ // endMask override
++ if (endMask & (1 << axis))
++ {
++ if (strides[axis] > 0)
++ {
++ // Forward iteration - use the last element. These values will get
++ // clamped below
++ stop = std::numeric_limits<int32_t>::max();
++ }
++ else
++ {
++ // Backward iteration - use the first element.
++ stop = std::numeric_limits<int32_t>::lowest();
++ }
++ }
++
++ // Handle negative indices
++ int32_t axisSize = inputShape[axis];
++ if (stop < 0)
++ {
++ stop += axisSize;
++ }
++
++ // Clamping
++ // Because the end index points one past the last element, we need slightly
++ // different clamping ranges depending on the direction.
++ if (strides[axis] > 0)
++ {
++ // Forward iteration
++ stop = arm_compute::utility::clamp(stop, 0, axisSize);
++ }
++ else
++ {
++ // Backward iteration
++ stop = arm_compute::utility::clamp(stop, -1, axisSize - 1);
++ }
++
++ return stop;
++}
++
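++// Worked example (illustrative): for an axis of size 5 with strides[axis] = 1,
++// startIndices[axis] = -2 and stopIndices[axis] = 5:
++// - StartForAxis maps -2 to -2 + 5 = 3 and clamps to [0, 4], giving 3;
++// - StopForAxis keeps 5 and clamps to [0, 5], giving 5;
++// so the slice visits indices 3 and 4. If the axis bit is set in beginMask,
++// start is overridden to lowest() and clamps to 0, i.e. the slice starts at
++// the first element regardless of startIndices.
++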
++inline int32_t offset4D(const TensorShape &shape, int32_t b, int32_t d, int32_t h, int32_t w)
++{
++ int32_t offset = b * shape[2] * shape[1] * shape[0];
++ offset += d * shape[1] * shape[0];
++ offset += h * shape[0];
++ offset += w;
++ return offset;
++}
++
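++// Worked example (illustrative): with shape (W, H, D, B) = (4, 3, 2, 1),
++// offset4D(shape, 0, 1, 2, 3) = 0*2*3*4 + 1*3*4 + 2*4 + 3 = 23, i.e. a plain
++// row-major linearisation with W as the innermost dimension.
++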
++void CLStridedSlice::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData, ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask, int32_t shrinkAxisMask)
++{
++ auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceKernel>();
++ k->configure(input, output, beginData, endData, stridesData, beginMask, endMask, shrinkAxisMask);
++ _kernel = std::move(k);
++}
++
++void CLStridedSliceCPU::configure(ICLTensor *input, ICLTensor *output, ICLTensor *beginData, ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask, int32_t shrinkAxisMask)
++{
++ ARM_COMPUTE_ERROR_THROW_ON(CLStridedSliceKernel::validate(input->info(), output->info(), beginData->info(), endData->info(), stridesData->info(), beginMask, endMask, shrinkAxisMask));
++
++ _input = input;
++ _output = output;
++ _beginData = beginData;
++ _endData = endData;
++ _stridesData = stridesData;
++ _beginMask = beginMask;
++ _endMask = endMask;
++ _shrinkAxisMask = shrinkAxisMask;
++}
++
++void CLStridedSliceCPU::run()
++{
++ run_on_cpu();
++
++ arm_compute::CLScheduler::get().sync();
++}
++
++inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride)
++{
++ if (stride > 0)
++ {
++ return ((stop - start - 1) / stride) + 1;
++ }
++ else
++ {
++ return ((stop - start + 1) / stride) + 1;
++ }
++}
++
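++// Worked example (illustrative): getOutDim(1, 7, 2) = ((7 - 1 - 1) / 2) + 1 = 3,
++// matching the three visited indices 1, 3 and 5; getOutDim(7, 1, -2) is also 3,
++// matching 7, 5 and 3 for the backward slice.
++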
++template <typename T>
++inline void StridedSlice(const T *inputData, const TensorShape &inputShape,
++ int32_t beginMask, int32_t endMask,
++ const std::vector<int32_t> &startIndices,
++ const std::vector<int32_t> &stopIndices,
++ const std::vector<int32_t> &strides, T *outputData)
++{
++ ARM_COMPUTE_ERROR_ON(startIndices.size() != static_cast<size_t>(maxDims));
++ ARM_COMPUTE_ERROR_ON(stopIndices.size() != static_cast<size_t>(maxDims));
++ ARM_COMPUTE_ERROR_ON(strides.size() != static_cast<size_t>(maxDims));
++
++ const int32_t start_b = StartForAxis(beginMask, startIndices, strides, inputShape, 3);
++ const int32_t stop_b = StopForAxis(endMask, stopIndices, strides, inputShape, 3);
++ const int32_t start_d = StartForAxis(beginMask, startIndices, strides, inputShape, 2);
++ const int32_t stop_d = StopForAxis(endMask, stopIndices, strides, inputShape, 2);
++ const int32_t start_h = StartForAxis(beginMask, startIndices, strides, inputShape, 1);
++ const int32_t stop_h = StopForAxis(endMask, stopIndices, strides, inputShape, 1);
++ const int32_t start_w = StartForAxis(beginMask, startIndices, strides, inputShape, 0);
++ const int32_t stop_w = StopForAxis(endMask, stopIndices, strides, inputShape, 0);
++
++ // The shape of outputData may collapse along one dimension, so build a
++ // shape that matches the sliced result instead of reusing the input shape.
++ TensorShape outputShape(getOutDim(start_w, stop_w, strides[0]), getOutDim(start_h, stop_h, strides[1]),
++ getOutDim(start_d, stop_d, strides[2]), getOutDim(start_b, stop_b, strides[3]));
++ for (int32_t in_b = start_b, b = 0; strides[3] > 0 ? in_b < stop_b : in_b > stop_b; in_b += strides[3], b++)
++ {
++ for (int32_t in_d = start_d, d = 0; strides[2] > 0 ? in_d < stop_d : in_d > stop_d; in_d += strides[2], d++)
++ {
++ for (int32_t in_h = start_h, h = 0; strides[1] > 0 ? in_h < stop_h : in_h > stop_h; in_h += strides[1], h++)
++ {
++ for (int32_t in_w = start_w, w = 0; strides[0] > 0 ? in_w < stop_w : in_w > stop_w; in_w += strides[0], w++)
++ {
++ outputData[offset4D(outputShape, b, d, h, w)] = inputData[offset4D(inputShape, in_b, in_d, in_h, in_w)];
++ }
++ }
++ }
++ }
++}
++
++void CLStridedSliceCPU::run_on_cpu()
++{
++ // TODO: Support shrinkAxisMask
++ cl::CommandQueue q = CLScheduler::get().queue();
++
++ _input->map(q);
++ _output->map(q);
++ _beginData->map(q);
++ _endData->map(q);
++ _stridesData->map(q);
++
++ TensorShape inputShape = _input->info()->tensor_shape();
++ TensorShape outputShape = _output->info()->tensor_shape();
++
++ std::vector<int32_t> starts;
++ std::vector<int32_t> stops;
++ std::vector<int32_t> strides;
++
++ for (uint32_t idx = 0; idx < _input->info()->num_dimensions(); ++idx) {
++ starts.emplace_back(reinterpret_cast<int32_t *>(_beginData->buffer())[idx]);
++ stops.emplace_back(reinterpret_cast<int32_t *>(_endData->buffer())[idx]);
++ strides.emplace_back(reinterpret_cast<int32_t *>(_stridesData->buffer())[idx]);
++ }
++
++ // Pad the unused upper dimensions with a full-range single element so the
++ // 4-D slice loops iterate them exactly once.
++ for (uint32_t i = _input->info()->num_dimensions(); i < static_cast<uint32_t>(maxDims); i++) {
++ starts.emplace_back(0);
++ stops.emplace_back(1);
++ strides.emplace_back(1);
++ }
++
++ switch (_input->info()->data_type())
++ {
++ case DataType::U8:
++ case DataType::QASYMM8:
++ StridedSlice(reinterpret_cast<const uint8_t *>(_input->buffer()), inputShape, _beginMask, _endMask, starts, stops, strides, reinterpret_cast<uint8_t *>(_output->buffer()));
++ break;
++ case DataType::S8:
++ case DataType::QS8:
++ StridedSlice(reinterpret_cast<const int8_t *>(_input->buffer()), inputShape, _beginMask, _endMask, starts, stops, strides, reinterpret_cast<int8_t *>(_output->buffer()));
++ break;
++ case DataType::U16:
++ StridedSlice(reinterpret_cast<const uint16_t *>(_input->buffer()), inputShape, _beginMask, _endMask, starts, stops, strides, reinterpret_cast<uint16_t *>(_output->buffer()));
++ break;
++ case DataType::S16:
++ case DataType::QS16:
++ StridedSlice(reinterpret_cast<const int16_t *>(_input->buffer()), inputShape, _beginMask, _endMask, starts, stops, strides, reinterpret_cast<int16_t *>(_output->buffer()));
++ break;
++ case DataType::F16:
++ // Note: the F16 path is untested; half values are copied bit-for-bit.
++ StridedSlice(reinterpret_cast<const half *>(_input->buffer()), inputShape, _beginMask, _endMask, starts, stops, strides, reinterpret_cast<half *>(_output->buffer()));
++ break;
++ case DataType::U32:
++ StridedSlice(reinterpret_cast<const uint32_t *>(_input->buffer()), inputShape, _beginMask, _endMask, starts, stops, strides, reinterpret_cast<uint32_t *>(_output->buffer()));
++ break;
++ case DataType::S32:
++ StridedSlice(reinterpret_cast<const int32_t *>(_input->buffer()), inputShape, _beginMask, _endMask, starts, stops, strides, reinterpret_cast<int32_t *>(_output->buffer()));
++ break;
++ case DataType::F32:
++ StridedSlice(reinterpret_cast<const float *>(_input->buffer()), inputShape, _beginMask, _endMask, starts, stops, strides, reinterpret_cast<float *>(_output->buffer()));
++ break;
++ default:
++ ARM_COMPUTE_ERROR("DataType not supported");
++ break;
++ }
++
++ _input->unmap(q);
++ _output->unmap(q);
++ _beginData->unmap(q);
++ _endData->unmap(q);
++ _stridesData->unmap(q);
++}
+diff --git a/src/runtime/CL/functions/CLTopKV2.cpp b/src/runtime/CL/functions/CLTopKV2.cpp
+new file mode 100644
+index 0000000..ed9797e
+--- /dev/null
++++ b/src/runtime/CL/functions/CLTopKV2.cpp
+@@ -0,0 +1,310 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (c) 2017 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#include "arm_compute/runtime/CL/functions/CLTopKV2.h"
++
++#include "arm_compute/core/CL/ICLTensor.h"
++#include "arm_compute/core/CL/CLHelpers.h"
++
++#include <algorithm>
++#include <cstdlib>
++#include <string>
++#include <utility>
++#include <vector>
++
++#include "../../topk_v2.h"
++
++namespace arm_compute
++{
++
++CLTopKV2::CLTopKV2()
++: _k(0), _total_bits(0), _bits(0), _radix(0), _hist_buf_size(0),
++ _glob_sum_buf_size(0), _n(0), _input(nullptr),
++ _values(nullptr), _indices(nullptr), _qs_idx_buf(), _qs_temp_buf(),
++ _hist_buf(), _glob_sum_buf(), _temp_buf(),
++ _first_negative_idx_buf(), _in_key_buf(), _out_key_buf(), _in_ind_buf(), _out_ind_buf(),
++ _p_in_key_buf(nullptr), _p_out_key_buf(nullptr), _p_in_ind_buf(nullptr), _p_out_ind_buf(nullptr),
++ _qs_kernel(),
++ _init_kernel(), _hist_kernel(), _scan_hist_kernel(), _glob_scan_hist_kernel(), _paste_hist_kernel(),
++ _reorder_kernel(), _find_first_negative_kernel(), _reorder_negatives_kernel(), _store_kernel()
++{
++}
++
++void CLTopKV2::configure(ICLTensor *input, int k, ICLTensor *values, ICLTensor *indices,
++ int total_bits, int bits)
++{
++ _total_bits = total_bits;
++ _bits = bits;
++ _n = input->info()->tensor_shape()[0];
++
++ // _total_bits must be divisible by _bits.
++ ARM_COMPUTE_ERROR_ON((_total_bits % _bits) != 0);
++
++ _k = k;
++ _radix = 1 << bits;
++
++ _input = input;
++ _values = values;
++ _indices = indices;
++
++ std::string topk_env;
++
++ const char *env = getenv("ACL_TOPKV2");
++ if(env)
++ topk_env = env;
++
++ if(topk_env == "GPU_SINGLE")
++ {
++ _qs_idx_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
++ sizeof(cl_int) * _n);
++ _qs_temp_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
++ sizeof(cl_int) * _n);
++
++ _qs_kernel.configure(input, values, indices, &_qs_idx_buf, &_qs_temp_buf, k, _n);
++ }
++ else if(topk_env == "GPU")
++ {
++ // _n must be divisible by (_GROUPS * _ITEMS)
++ ARM_COMPUTE_ERROR_ON((_n % (_GROUPS * _ITEMS)) != 0);
++
++ _hist_buf_size = _radix * _GROUPS * _ITEMS;
++ _glob_sum_buf_size = _HISTOSPLIT;
++
++ _hist_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
++ sizeof(cl_int) * _hist_buf_size);
++ _glob_sum_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
++ sizeof(cl_int) * _glob_sum_buf_size);
++ _temp_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
++ sizeof(cl_int) * _glob_sum_buf_size);
++ _first_negative_idx_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
++ sizeof(cl_int));
++ _in_key_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
++ sizeof(cl_float) * _n);
++ _out_key_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
++ sizeof(cl_float) * _n);
++ _in_ind_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
++ sizeof(cl_int) * _n);
++ _out_ind_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
++ sizeof(cl_int) * _n);
++
++ _p_in_key_buf = &_in_key_buf;
++ _p_out_key_buf = &_out_key_buf;
++ _p_in_ind_buf = &_in_ind_buf;
++ _p_out_ind_buf = &_out_ind_buf;
++
++ _init_kernel.configure(input, _p_in_key_buf, _p_in_ind_buf, _n);
++ _hist_kernel.configure(&_hist_buf, bits, _n);
++ _scan_hist_kernel.configure(&_hist_buf, &_glob_sum_buf, bits);
++ _glob_scan_hist_kernel.configure(&_glob_sum_buf, &_temp_buf, bits);
++ _paste_hist_kernel.configure(&_hist_buf, &_glob_sum_buf, bits);
++ _reorder_kernel.configure(&_hist_buf, bits, _n);
++ _find_first_negative_kernel.configure(&_first_negative_idx_buf, _n);
++ _reorder_negatives_kernel.configure(&_first_negative_idx_buf, _n);
++ _store_kernel.configure(values, indices, k, _n);
++ }
++ else
++ {
++ // Nothing to configure here; the CPU path reads the tensors directly in run_on_cpu().
++ }
++}
++
++void CLTopKV2::run()
++{
++ std::string topk_env;
++
++ const char *env = getenv("ACL_TOPKV2");
++ if(env)
++ topk_env = env;
++
++ if(topk_env == "GPU_SINGLE")
++ {
++ run_on_gpu_single_quicksort();
++ }
++ else if(topk_env == "GPU")
++ {
++ run_on_gpu();
++ }
++ else
++ {
++ run_on_cpu();
++ }
++}
++
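++// The execution path is chosen at run time from the ACL_TOPKV2 environment
++// variable; e.g., for a hypothetical binary `app`:
++//
++//   $ ACL_TOPKV2=GPU ./app        # parallel radix-sort path (run_on_gpu)
++//   $ ACL_TOPKV2=GPU_SINGLE ./app # single-threaded GPU quicksort
++//   $ ./app                       # any other value: CPU fallback (run_on_cpu)
++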
++void CLTopKV2::run_on_gpu_single_quicksort()
++{
++ // This is a single-threaded quicksort implementation.
++ CLScheduler::get().enqueue(_qs_kernel, false);
++
++ arm_compute::CLScheduler::get().sync();
++}
++
++void CLTopKV2::run_on_gpu()
++{
++ cl::CommandQueue q = CLScheduler::get().queue();
++ (void)q; // q is only used by the disabled debug block below
++
++ // 1. CLTopKV2Init sets up the key and index buffers.
++ // - The key buffer is filled with the layer's input values.
++ // - The index buffer is filled with the corresponding indices.
++ CLScheduler::get().enqueue(_init_kernel, false);
++
++ int n_passes = _total_bits / _bits;
++
++ // 2. Repeat (total_bits / bits) times.
++ // - total_bits is the number of bits of the data type (e.g., 32 for float)
++ // - bits defines the number of buckets per pass (e.g., 2^4 = 16 buckets when bits is 4)
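++ // For example (illustrative), sorting 32-bit keys with bits = 4 takes
++ // 32 / 4 = 8 passes over 2^4 = 16 buckets each.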
++ for(int pass = 0; pass < n_passes; ++pass) {
++ arm_compute::CLScheduler::get().sync();
++
++ // 2.1. Calculate histogram with _GROUPS * _ITEMS threads
++ _hist_kernel.setPass(pass, _p_in_key_buf);
++ CLScheduler::get().enqueue(_hist_kernel, false);
++
++ // 2.2. Calculate prefix sum locally with multiple threads
++ CLScheduler::get().enqueue(_scan_hist_kernel, false);
++ // 2.3. Scan the per-group sums
++ CLScheduler::get().enqueue(_glob_scan_hist_kernel, false);
++ // 2.4. Paste the scanned sums back to complete the global prefix sum
++ CLScheduler::get().enqueue(_paste_hist_kernel, false);
++
++ // 2.5. Reorder keys and indices based on the global prefix sum
++ _reorder_kernel.setPass(pass, _p_in_key_buf, _p_out_key_buf,
++ _p_in_ind_buf, _p_out_ind_buf);
++ CLScheduler::get().enqueue(_reorder_kernel, false);
++
++ // Swap the ping-pong buffers so the next pass reads what this pass wrote.
++ std::swap(_p_in_key_buf, _p_out_key_buf);
++ std::swap(_p_in_ind_buf, _p_out_ind_buf);
++ }
++
++ // 3. Get the index of the first negative key.
++ // Because the buffers are swapped at the end of the loop above, the final
++ // sorted data ends up in the 'in' buffers.
++ _find_first_negative_kernel.setOutputBuffer(_p_in_key_buf);
++ CLScheduler::get().enqueue(_find_first_negative_kernel, false);
++
++ // 4. Correct the ordering of negatives.
++ // - Radix sort treats the keys as unsigned, so negative values sort above positives.
++ // The reordered data is stored in _p_out_key_buf and _p_out_ind_buf.
++ _reorder_negatives_kernel.setBuffers(_p_in_key_buf, _p_out_key_buf,
++ _p_in_ind_buf, _p_out_ind_buf);
++ CLScheduler::get().enqueue(_reorder_negatives_kernel, false);
++
++ // 5. Extract top k values from sorted keys and indices.
++ _store_kernel.setOutputBuffers(_p_out_key_buf, _p_out_ind_buf);
++ CLScheduler::get().enqueue(_store_kernel, false);
++
++ arm_compute::CLScheduler::get().sync();
++
++#if 0
++ // The code below is kept for debugging (note: it relies on non-standard variable-length arrays).
++ int first_neg;
++ q.enqueueReadBuffer(_first_negative_idx_buf, CL_TRUE, 0, sizeof(cl_int), &first_neg);
++ std::cout << "first neg = " << first_neg << std::endl;
++
++ float in_key[_n];
++ q.enqueueReadBuffer(*_p_in_key_buf, CL_TRUE, 0, sizeof(cl_float)*_n, in_key);
++ for(uint32_t i = 0 ; i < _n; ++i) {
++ std::cout << "in_key[" << i << "] = " << in_key[i] << std::endl;
++ }
++
++ float out_key[_n];
++ q.enqueueReadBuffer(*_p_out_key_buf, CL_TRUE, 0, sizeof(cl_float)*_n, out_key);
++ for(uint32_t i = 0 ; i < _n; ++i) {
++ std::cout << "out_key[" << i << "] = " << out_key[i] << std::endl;
++ }
++
++ int in_ind[_n];
++ q.enqueueReadBuffer(*_p_in_ind_buf, CL_TRUE, 0, sizeof(cl_int)*_n, in_ind);
++ for(uint32_t i = 0 ; i < _n; ++i) {
++ std::cout << "in_ind[" << i << "] = " << in_ind[i] << std::endl;
++ }
++
++ int out_ind[_n];
++ q.enqueueReadBuffer(*_p_out_ind_buf, CL_TRUE, 0, sizeof(cl_int)*_n, out_ind);
++ for(uint32_t i = 0 ; i < _n; ++i) {
++ std::cout << "out_ind[" << i << "] = " << out_ind[i] << std::endl;
++ }
++
++ int hist_buf[_hist_buf_size];
++ q.enqueueReadBuffer(_hist_buf, CL_TRUE, 0, sizeof(cl_int)*_hist_buf_size, hist_buf);
++ for(uint32_t i = 0 ; i < _hist_buf_size; ++i) {
++ std::cout << "hist_buf[" << i << "] = " << hist_buf[i] << std::endl;
++ }
++
++ int glob_sum_buf[_glob_sum_buf_size];
++ q.enqueueReadBuffer(_glob_sum_buf, CL_TRUE, 0, sizeof(cl_int)*_glob_sum_buf_size, glob_sum_buf);
++ for(uint32_t i = 0 ; i < _glob_sum_buf_size; ++i) {
++ std::cout << "glob_sum_buf[" << i << "] = " << glob_sum_buf[i] << std::endl;
++ }
++
++#endif
++}
++
++void CLTopKV2::run_on_cpu()
++{
++ cl::CommandQueue q = CLScheduler::get().queue();
++
++ _input->map(q);
++ _values->map(q);
++ _indices->map(q);
++
++ int row_size = _input->info()->tensor_shape()[0];
++ int rank = _input->info()->num_dimensions();
++
++ if (rank > 2)
++ throw std::runtime_error("Input rank > 2 is not supported.");
++
++ int row_num = (rank == 2 ? _input->info()->tensor_shape()[1] : 1);
++
++ if (_input->info()->data_type() == DataType::F32)
++ {
++ nnfw::rt::optimized_ops::TopK<float>(row_size, row_num, (float*)_input->buffer(), _k,
++ (int32*)_indices->buffer(), (float*)_values->buffer());
++ }
++ else if (_input->info()->data_type() == DataType::S32)
++ {
++ nnfw::rt::optimized_ops::TopK<int32_t>(row_size, row_num, (int32_t*)_input->buffer(), _k,
++ (int32*)_indices->buffer(), (int32_t*)_values->buffer());
++ }
++ else if (_input->info()->data_type() == DataType::QASYMM8)
++ {
++ nnfw::rt::optimized_ops::TopK<uint8_t>(row_size, row_num, (uint8_t*)_input->buffer(), _k,
++ (int32*)_indices->buffer(), (uint8_t*)_values->buffer());
++ }
++ else
++ {
++ throw std::runtime_error("Unsupported data type.");
++ }
++
++ _input->unmap(q);
++ _values->unmap(q);
++ _indices->unmap(q);
++}
++} // namespace arm_compute
+diff --git a/src/runtime/topk_v2.h b/src/runtime/topk_v2.h
+new file mode 100644
+index 0000000..2419ee9
+--- /dev/null
++++ b/src/runtime/topk_v2.h
+@@ -0,0 +1,141 @@
++/*
++ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright (C) 2017 The Android Open Source Project
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ */
++
++#ifndef __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__
++#define __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__
++
++#include <algorithm>
++#include <cstdint>
++#include <vector>
++
++typedef int32_t int32;
++
++namespace nnfw
++{
++namespace rt
++{
++namespace optimized_ops
++{
++// The following code was implemented with reference to the TFLite topk_v2.cc file.
++// TopK_v2 of NN Runtime supports TENSOR_FLOAT32, TENSOR_QUANT8_ASYMM, and TENSOR_INT32.
++// (TFLite additionally supports kTfLiteInt64.)
++
++// A class that collects the indices of the top k values. Based on the
++// tensorflow::gtl::TopN<> template but, as an optimization, it reuses the
++// same container across rows.
++template <typename T> class TopContainer
++{
++public:
++ TopContainer() = delete;
++ TopContainer(int32 k, int32 row_size) : k_(k), container_(), values_(nullptr)
++ { container_.reserve(std::min(k, row_size) + 1); }
++
++ /** Prevent instances of this class from being copied (As this class contains pointers) */
++ TopContainer(const TopContainer&) = delete;
++ /** Prevent instances of this class from being copied (As this class contains pointers) */
++ TopContainer& operator=(const TopContainer&) = delete;
++
++ void start_collecting(const T *values)
++ {
++ values_ = values;
++ container_.clear();
++ }
++
++ void push(int32 a)
++ {
++ auto comparator = [this](int32 a, int32 b) { return compare_fun(a, b); };
++ if (container_.size() <= (size_t)k_)
++ {
++ container_.push_back(a);
++ if (container_.size() == (size_t)(k_ + 1))
++ {
++ std::make_heap(container_.begin(), container_.end(), comparator);
++ std::pop_heap(container_.begin(), container_.end(), comparator);
++ }
++ }
++ else if (comparator(a, container_.front()))
++ {
++ container_.back() = a;
++ std::push_heap(container_.begin(), container_.end(), comparator);
++ std::pop_heap(container_.begin(), container_.end(), comparator);
++ }
++ }
++
++ const std::vector<int32> &sorted_result()
++ {
++ auto comparator = [this](int32 a, int32 b) { return compare_fun(a, b); };
++ if (container_.size() <= (size_t)(k_))
++ {
++ std::sort(container_.begin(), container_.end(), comparator);
++ }
++ else
++ {
++ std::sort_heap(container_.begin(), container_.end() - 1, comparator);
++ container_.resize(k_);
++ }
++ return container_;
++ }
++
++private:
++ int32 k_;
++ std::vector<int32> container_;
++ const T *values_ = nullptr;
++
++ bool compare_fun(int32 a, int32 b) const
++ {
++ if (values_[b] < values_[a])
++ {
++ return true;
++ }
++ else if (values_[b] > values_[a])
++ {
++ return false;
++ }
++ else
++ {
++ return a < b;
++ }
++ }
++};
++
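++// Illustrative behaviour, assuming values = {5, 1, 9, 3} and k = 2: pushing
++// indices 0..3 keeps at most k + 1 candidates in a heap whose front is the
++// current worst, and sorted_result() returns {2, 0}, the indices of 9 and 5,
++// best value first.
++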
++template <typename T>
++void TopK(int32 row_size, int32 num_rows, const T *data, int32 k, int32 *output_indexes,
++ T *output_values)
++{
++ TopContainer<T> topc(k, row_size);
++ for (int row = 0; row < num_rows; ++row)
++ {
++ const T *values_row = data + row * row_size;
++ topc.start_collecting(values_row);
++ for (int32 c = 0; c < row_size; ++c)
++ {
++ topc.push(c);
++ }
++
++ // Prepare output buffers.
++ int32 *indexes_row = output_indexes + row * k;
++ T *output_row = output_values + row * k;
++ // The output is always sorted, best value first.
++ const auto &top_k = topc.sorted_result();
++ std::copy(top_k.begin(), top_k.end(), indexes_row);
++ std::transform(top_k.begin(), top_k.end(), output_row,
++ [values_row](const int32 loc) { return values_row[loc]; });
++ }
++}
++
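++// Usage sketch (illustrative only): for a single row of four floats,
++//
++//   const float data[4] = {0.1f, 0.7f, 0.3f, 0.5f};
++//   int32 idx[2];
++//   float val[2];
++//   TopK<float>(/*row_size*/ 4, /*num_rows*/ 1, data, /*k*/ 2, idx, val);
++//   // -> idx = {1, 3}, val = {0.7f, 0.5f}
++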
++} // namespace optimized_ops
++} // namespace rt
++} // namespace nnfw
++
++#endif // __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__
+--
+1.9.1
+