path: root/libs/ARMComputeEx/arm_compute/runtime/CL/functions
Diffstat (limited to 'libs/ARMComputeEx/arm_compute/runtime/CL/functions')
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLActivationLayerEx.h       63
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMax.h              114
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArithmeticSubtractionEx.h 62
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h          45
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h         41
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h                    27
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLComparisonOp.h            42
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h            44
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h         54
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLExp.h                     38
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h                  37
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h         59
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h                     39
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNormalizationLayerEx.h    77
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h                   40
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h              47
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPermuteEx.h               51
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h       34
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h               81
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h         87
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReductionMean.h           73
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h          56
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h            44
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSquaredDifference.h       40
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSlice.h            69
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h          58
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h                  60
27 files changed, 1206 insertions, 276 deletions
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLActivationLayerEx.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLActivationLayerEx.h
new file mode 100644
index 000000000..7e578550f
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLActivationLayerEx.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLACTIVATIONLAYEREX_H__
+#define __ARM_COMPUTE_CLACTIVATIONLAYEREX_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLActivationLayerExKernel
+ *
+ * @note The function simulates an activation layer with the specified activation function.
+ */
+class CLActivationLayerEx : public ICLSimpleFunction
+{
+public:
+ /** Set the input and output tensor.
+ *
+ * @note If the output tensor is a nullptr or is equal to the input, the activation function will
+ * be performed in-place
+ *
+ * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will
+ * store the result
+ * of the activation function. Data types supported:
+ * QASYMM8/F16/F32.
+ * @param[out] output Destination tensor. Data type supported: same as @p input
+ * @param[in] act_info Activation layer parameters.
+ */
+ void configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfoEx act_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLActivationLayerEx
+ *
+ * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor
+ * will store the result
+ * of the activation function. Data types supported: QASYMM8/F16/F32.
+ * @param[in] output Destination tensor info. Data type supported: same as @p input
+ * @param[in] act_info Activation layer information.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ActivationLayerInfoEx &act_info);
+};
+}
+#endif /* __ARM_COMPUTE_CLACTIVATIONLAYEREX_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMax.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMax.h
new file mode 100644
index 000000000..8044c58af
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMax.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLArgMinMax.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLArgMinMax class
+ */
+
+#ifndef __ARM_COMPUTE_CLARG_MIN_MAX_H__
+#define __ARM_COMPUTE_CLARG_MIN_MAX_H__
+
+#include "arm_compute/core/CL/kernels/CLArgMinMaxKernel.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to execute CLArgMinMax operation
+ */
+class CLArgMinMax : public IFunction
+{
+public:
+ /**
+ * @brief Construct a new CLArgMinMax object
+ */
+ CLArgMinMax();
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ */
+ CLArgMinMax(const CLArgMinMax &) = delete;
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ */
+ CLArgMinMax &operator=(const CLArgMinMax &) = delete;
+
+ /**
+ * @brief Construct a new CLArgMinMax object by using move constructor
+ * @param[in] CLArgMinMax object to move
+ */
+ CLArgMinMax(CLArgMinMax &&) = default;
+
+ /**
+ * @brief Assign a CLArgMinMax object.
+ * @param[in] CLArgMinMax object to assign. This object will be moved.
+ */
+ CLArgMinMax &operator=(CLArgMinMax &&) = default;
+
+ /**
+ * @brief Initialise the kernel's inputs and outputs.
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S32/F32.
+ * @param[out] output The result of the ArgMinMax operation. Data types supported: same as @p
+ * input.
+ * @param[in] argminmax_axis Axes to perform ArgMin/ArgMax on. They must be sorted and contain
+ * no duplicates.
+ * @param[in] op Operation to perform: ArgMin or ArgMax.
+ * @return N/A
+ */
+ void configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> argminmax_axis,
+ ArgOperation op);
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S32/F32.
+ * @param[in] argminmax_axis Axes to perform ArgMin/ArgMax on
+ * @param[in] output The result of the ArgMinMax operation. Data types supported: same as @p
+ * input.
+ * @param[in] op Operation to perform: ArgMin or ArgMax.
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const std::vector<uint32_t> &argminmax_axis,
+ const ITensorInfo *output, ArgOperation op);
+
+ /**
+ * @brief Run the kernels contained in the function
+ * This operation runs on the CPU or the GPU depending on the value of the
+ * argminmax_MAX_RUN_ON_CPU macro in CLArgMinMax.cpp.
+ * If argminmax_MAX_RUN_ON_CPU == 1, the CPU runs this operation.
+ * Otherwise the GPU runs this operation.
+ * @return N/A
+ */
+ void run() override;
+
+private:
+ ICLTensor *_input;
+ ICLTensor *_output;
+ std::vector<uint32_t> _argminmax_axis;
+ ArgOperation _arg_op;
+
+ std::unique_ptr<CLTensor[]> _interm_tensors{nullptr};
+ std::unique_ptr<CLArgMinMaxKernel[]> _argminmax_kernels{nullptr};
+ size_t _num_of_kernels;
+};
+}
+#endif /* __ARM_COMPUTE_CLARG_MIN_MAX_H__ */
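A minimal usage sketch for CLArgMinMax (illustration only, not part of this diff). It assumes the standard ACL CLScheduler/CLTensor workflow; the tensor shapes and the ArgOperation::MAX enumerator are assumptions standing in for whatever TypesEx.h actually defines.

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLArgMinMax.h"

    using namespace arm_compute;

    void run_argminmax_sketch()
    {
      CLScheduler::get().default_init();                 // create the CL context and queue

      CLTensor input, output;
      input.allocator()->init(TensorInfo(TensorShape(10U, 4U), 1, DataType::F32));
      output.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::F32));  // axis 0 reduced away

      CLArgMinMax argminmax;
      argminmax.configure(&input, &output, {0U}, ArgOperation::MAX);  // enumerator name is assumed

      input.allocator()->allocate();
      output.allocator()->allocate();

      argminmax.run();                                   // CPU or GPU path, per the macro documented above
      CLScheduler::get().sync();
    }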
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArithmeticSubtractionEx.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArithmeticSubtractionEx.h
new file mode 100644
index 000000000..34e6c6334
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArithmeticSubtractionEx.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLARITHMETICSUBTRACTIONEX_H__
+#define __ARM_COMPUTE_CLARITHMETICSUBTRACTIONEX_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLArithmeticSubtractionExKernel
+ *
+ * @note The tensor data type for the inputs must be U8/S16/F16/F32.
+ * @note The function performs an arithmetic subtraction between two tensors.
+ */
+class CLArithmeticSubtractionEx : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs, output and conversion policy.
+ *
+ * @param[in, out] input1 An input tensor. Data types supported: U8/S16/F16/F32.
+ * The input tensor is [in, out] because its TensorInfo might be modified
+ * inside the kernel in case of broadcasting of dimension 0.
+ * @param[in, out] input2 An input tensor. Data types supported: same as @p input1.
+ * The input tensor is [in, out] because its TensorInfo might be modified
+ * inside the kernel in case of broadcasting of dimension 0.
+ * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8),
+ * S16/F16/F32.
+ * @param[in] policy Policy to use to handle overflow.
+ */
+ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLArithmeticSubtractionEx
+ *
+ * @param[in] input1 First tensor input info. Data types supported: U8/S16/F16/F32.
+ * @param[in] input2 Second tensor input info. Data types supported: U8/S16/F16/F32.
+ * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8),
+ * S16/F16/F32.
+ * @param[in] policy Policy to use to handle overflow.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output, ConvertPolicy policy);
+};
+}
+#endif /* __ARM_COMPUTE_CLARITHMETICSUBTRACTIONEX_H__ */
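A hedged usage sketch for CLArithmeticSubtractionEx (illustrative, not from the diff). It assumes the usual CLScheduler/CLTensor setup; ConvertPolicy::SATURATE is the core-library enum referenced by the configure() signature above. The static validate() can be called with the tensors' info() before configure() to check the configuration.

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLArithmeticSubtractionEx.h"

    using namespace arm_compute;

    void run_subtraction_sketch()
    {
      CLScheduler::get().default_init();

      CLTensor a, b, out;
      const TensorInfo info(TensorShape(32U, 32U), 1, DataType::F32);
      a.allocator()->init(info);
      b.allocator()->init(info);
      out.allocator()->init(info);

      CLArithmeticSubtractionEx sub;
      sub.configure(&a, &b, &out, ConvertPolicy::SATURATE);  // saturate on overflow

      a.allocator()->allocate();
      b.allocator()->allocate();
      out.allocator()->allocate();

      sub.run();
      CLScheduler::get().sync();
    }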
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h
new file mode 100644
index 000000000..d16a0762d
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__
+#define __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLBatchToSpaceNDKernel
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32.
+ * @note The function rearranges data from the batch dimension into blocks of the spatial dimensions.
+ */
+class CLBatchToSpaceND : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[in] block_size A pointer to an array of integer values specifying block sizes
+ * for the spatial dimensions.
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const int32_t *block_size);
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ */
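A usage sketch for CLBatchToSpaceND (illustrative, not part of the diff). The shapes are arbitrary assumptions following ACL's (width, height, channels, batch) shape order; block_size is the raw int32_t array the configure() above expects.

    #include <cstdint>

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLBatchToSpaceND.h"

    using namespace arm_compute;

    void run_batch_to_space_sketch()
    {
      CLScheduler::get().default_init();

      CLTensor input, output;
      // Four batches of 2x2x8 become one batch of 4x4x8 with a 2x2 block (assumed shapes).
      input.allocator()->init(TensorInfo(TensorShape(2U, 2U, 8U, 4U), 1, DataType::F32));
      output.allocator()->init(TensorInfo(TensorShape(4U, 4U, 8U, 1U), 1, DataType::F32));

      const int32_t block_size[] = {2, 2};  // block sizes for the spatial dimensions

      CLBatchToSpaceND b2s;
      b2s.configure(&input, &output, block_size);

      input.allocator()->allocate();
      output.allocator()->allocate();

      b2s.run();
      CLScheduler::get().sync();
    }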
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h
new file mode 100644
index 000000000..061e34f26
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLBINARYLOGICALOP_H__
+#define __ARM_COMPUTE_CLBINARYLOGICALOP_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+class CLBinaryLogicalOp : public ICLSimpleFunction
+{
+public:
+ /** Initialise the function's source and destination.
+ *
+ * @param[in] input1 Source tensor1. Data types supported: U8, QASYMM8.
+ * @param[in] input2 Source tensor2. Data types supported: U8, QASYMM8.
+ * @param[out] output Output tensor. Data types supported: U8, QASYMM8.
+ * @param[in] op Binary logical operation to perform.
+ */
+ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output,
+ BinaryLogicalOperation op);
+};
+
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLBINARYLOGICALOP_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h
index 63050067d..56b8408e2 100644
--- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h
@@ -14,30 +14,35 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
+/**
+ * @file CLCast.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLCast class
+ */
+
#ifndef __ARM_COMPUTE_CLCAST_H__
#define __ARM_COMPUTE_CLCAST_H__
-#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
namespace arm_compute
{
class ICLTensor;
-/** Basic function to run @ref CLCastKernel
- *
- * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32.
- * @note The function converts the input tensor to the tensor of the output tensor's type.
+/**
+ * @brief Class to run @ref CLCastKernel.
+ * This converts the input tensor to the tensor of the output tensor's type.
*/
class CLCast : public ICLSimpleFunction
{
public:
- /** Initialise the kernel's input and output.
- *
- * @param[in, out] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
- * The input tensor is [in, out] because its TensorInfo might be modified
- * inside the kernel.
- * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ /**
+ * @brief Initialise the kernel's input and output
+ * @param[in, out] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * The input tensor is [in, out] because its TensorInfo might be
+ * modified inside the kernel.
+ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
*/
void configure(ICLTensor *input, ICLTensor *output);
};
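A usage sketch for the updated CLCast interface (illustration, not part of the diff); it assumes the standard CLScheduler/CLTensor boilerplate, with the target type taken from the output tensor's TensorInfo.

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLCast.h"

    using namespace arm_compute;

    void run_cast_sketch()
    {
      CLScheduler::get().default_init();

      CLTensor src, dst;
      src.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::U8));
      dst.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));  // cast target type

      CLCast cast;
      cast.configure(&src, &dst);  // output type is read from dst's TensorInfo

      src.allocator()->allocate();
      dst.allocator()->allocate();

      cast.run();
      CLScheduler::get().sync();
    }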
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLComparisonOp.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLComparisonOp.h
new file mode 100644
index 000000000..1b0d70e7f
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLComparisonOp.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLCOMPARISON_OP_H__
+#define __ARM_COMPUTE_CLCOMPARISON_OP_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+class CLComparisonOp : public ICLSimpleFunction
+{
+public:
+ /** Initialise the function's source and destination.
+ *
+ * @param[in] input1 Source tensor1. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[in] input2 Source tensor2. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[out] output Output tensor. Data types supported: Same as @p input1.
+ * @param[in] op Comparison operation to perform.
+ */
+ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output,
+ const ComparisonOperation &op);
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLCOMPARISON_OP_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h
new file mode 100644
index 000000000..d78a6ada4
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLDEPTHTOSPACE_H__
+#define __ARM_COMPUTE_CLDEPTHTOSPACE_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLDepthToSpaceKernel
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32.
+ * @note The function rearranges data from the depth dimension into spatial blocks of the output.
+ */
+class CLDepthToSpace : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[in] block_size Block size; only integer values are supported.
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const int32_t block_size);
+};
+} // namespace arm_compute
+
+#endif /* __ARM_COMPUTE_CLDEPTHTOSPACE_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h
new file mode 100644
index 000000000..257772a89
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLEmbeddingLookup.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLEmbeddingLookup class
+ */
+
+#ifndef __ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__
+#define __ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to perform EmbeddingLookup operation
+ */
+class CLEmbeddingLookup : public ICLSimpleFunction
+{
+public:
+ /**
+ * @brief Set the input and output tensors.
+ * @param[in] input Source tensor.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[in] lookups Lookups 1D tensor whose values are indices into the first dimension of
+ * input.
+ * @return N/A
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups);
+};
+}
+#endif /*__ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__ */
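A usage sketch for CLEmbeddingLookup (illustrative, not from the diff): rows of a table tensor are gathered by a 1D lookups tensor. The S32 index type follows the doc comment; the table/output shapes are assumptions.

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLEmbeddingLookup.h"

    using namespace arm_compute;

    void run_embedding_lookup_sketch()
    {
      CLScheduler::get().default_init();

      CLTensor table, lookups, output;
      table.allocator()->init(TensorInfo(TensorShape(64U, 100U), 1, DataType::F32));  // 100 rows of 64
      lookups.allocator()->init(TensorInfo(TensorShape(5U), 1, DataType::S32));       // 5 row indices
      output.allocator()->init(TensorInfo(TensorShape(64U, 5U), 1, DataType::F32));

      CLEmbeddingLookup lookup;
      lookup.configure(&table, &output, &lookups);

      table.allocator()->allocate();
      lookups.allocator()->allocate();
      output.allocator()->allocate();

      lookup.run();
      CLScheduler::get().sync();
    }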
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLExp.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLExp.h
new file mode 100644
index 000000000..2d0fc23a4
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLExp.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLEXP_H__
+#define __ARM_COMPUTE_CLEXP_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLExpKernel */
+class CLExp : public ICLSimpleFunction
+{
+public:
+ /** Set the source and destination of the kernel
+ *
+ * @param[in] input Source tensor. Data type supported: F32.
+ * @param[out] output Destination tensor. Data type supported: F32.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output);
+};
+}
+#endif /* __ARM_COMPUTE_CLEXP_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h
index 3ae7afe14..f7fd3cda1 100644
--- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h
@@ -14,32 +14,43 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
+/**
+ * @file CLGather.h
+ * @brief This file contains CLGather class
+ * @ingroup COM_AI_RUNTIME
+ */
+
#ifndef __ARM_COMPUTE_CLGATHER_H__
#define __ARM_COMPUTE_CLGATHER_H__
-#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
namespace arm_compute
{
class ICLTensor;
-/** Basic function to run @ref CLGatherKernel. */
+/**
+ * @brief Class to run @ref CLGatherKernel.
+ */
class CLGather : public ICLSimpleFunction
{
public:
- /** Initialise the kernel's inputs, output and convertion policy.
- *
- * @param[in] input1 An input tensor. Data types supported: U8/S32/F32.
- * @param[in] input2 An indexes tensor. Data types supported: S32.
- * @param[out] output The output tensor, Data types supported: same as @p input1.
- */
+ /**
+ * @brief Initialise the kernel's inputs and output.
+ * @param[in] input1 An input tensor. Data types supported: U8/S32/F32.
+ * @param[in] input2 An indexes tensor. Data types supported: S32.
+ * @param[out] output The output tensor, Data types supported: same as @p input1.
+ * @return N/A
+ */
void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGather
- *
- * @param[in] input1 An input tensor. Data types supported: U8/S32/F32.
- * @param[in] input2 An indexes tensor. Data types supported: S32.
- * @param[out] output The output tensor, Data types supported: same as @p input1.
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration
+ * of @ref CLGather
+ * @param[in] input1 An input tensor. Data types supported: U8/S32/F32.
+ * @param[in] input2 An indexes tensor. Data types supported: S32.
+ * @param[out] output The output tensor, Data types supported: same as @p input1.
* @return a status
*/
static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h
new file mode 100644
index 000000000..65aa6cbd5
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLHashtableLookup.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLHashtableLookup class
+ */
+
+#ifndef __ARM_COMPUTE_CLHASHTABLELOOKUP_H__
+#define __ARM_COMPUTE_CLHASHTABLELOOKUP_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to perform HashtableLookup operation
+ */
+class CLHashtableLookup : public ICLSimpleFunction
+{
+public:
+ /**
+ * @brief Set the input and output tensors.
+ * @param[in] lookups Lookups 1D tensor whose values are indices into the first dimension of
+ * input.
+ * @param[in] keys Keys 1D tensor. keys and input pair represent a map.
+ * Data types supported: S32
+ * @param[in] input Source tensor.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[out] hits Hits 1D tensor. A boolean tensor that indicates whether the lookup hits
+ * (True) or not (False). Data types supported: U8/QASYMM8
+ * @return N/A
+ */
+ void configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *input,
+ ICLTensor *output, ICLTensor *hits);
+};
+}
+#endif /*__ARM_COMPUTE_CLHASHTABLELOOKUP_H__ */
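A usage sketch for CLHashtableLookup (illustrative, not part of the diff). The tensor shapes are assumptions; the hits tensor is U8 per the doc comment, and keys/lookups are S32.

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLHashtableLookup.h"

    using namespace arm_compute;

    void run_hashtable_lookup_sketch()
    {
      CLScheduler::get().default_init();

      CLTensor lookups, keys, values, output, hits;
      lookups.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::S32));        // queried keys
      keys.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::S32));          // stored keys
      values.allocator()->init(TensorInfo(TensorShape(16U, 10U), 1, DataType::F32));   // one row per key
      output.allocator()->init(TensorInfo(TensorShape(16U, 3U), 1, DataType::F32));
      hits.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::U8));            // hit/miss flags

      CLHashtableLookup hashtable;
      hashtable.configure(&lookups, &keys, &values, &output, &hits);

      lookups.allocator()->allocate();
      keys.allocator()->allocate();
      values.allocator()->allocate();
      output.allocator()->allocate();
      hits.allocator()->allocate();

      hashtable.run();
      CLScheduler::get().sync();
    }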
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h
new file mode 100644
index 000000000..198a0fd4e
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLNEG_H__
+#define __ARM_COMPUTE_CLNEG_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+class CLNeg : public ICLSimpleFunction
+{
+public:
+ /** Initialise the function's source and destination.
+ *
+ * @param[in] input Source tensor. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[out] output Output tensor. Data types supported: Same as @p input.
+ *
+ */
+ void configure(ICLTensor *input, ICLTensor *output);
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLNEG_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNormalizationLayerEx.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNormalizationLayerEx.h
new file mode 100644
index 000000000..4077245d5
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNormalizationLayerEx.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLNORMALIZATIONLAYEREX_H__
+#define __ARM_COMPUTE_CLNORMALIZATIONLAYEREX_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to compute a normalization layer. This function calls the following CL kernels:
+ *
+ * -# @ref CLFillBorderKernel
+ * -# @ref CLNormalizationLayerExKernel
+ *
+ */
+class CLNormalizationLayerEx : public IFunction
+{
+public:
+ /** Default constructor */
+ CLNormalizationLayerEx();
+ /** Set the input and output tensors.
+ *
+ * @param[in, out] input Source tensor. 3 lower dims represent a single input with dimensions
+ * [width, height, IFM],
+ * and an optional 4th dimension for batch of inputs. Data types
+ * supported: F16/F32 (Written to by the border handler)
+ * @param[out] output Destination tensor. Dimensions, data type and number of channels must
+ * match the input ones.
+ * @param[in] norm_info Normalization layer information like the normalization type,
+ * normalization size and other parameters.
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLNormalizationLayerEx
+ *
+ * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions
+ * [width, height, IFM],
+ * and an optional 4th dimension for batch of inputs. Data types supported:
+ * F16/F32
+ * @param[in] output Destination tensor. Dimensions, data type and number of channels must
+ * match the input ones.
+ * @param[in] norm_info Normalization layer information like the normalization type, normalization
+ * size and other parameters.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const NormalizationLayerInfo &norm_info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ CLNormalizationLayerExKernel _norm_kernel; /**< Normalization layer kernel to run */
+ CLFillBorderKernel _border_handler; /**< Kernel to handle borders */
+};
+}
+#endif /* __ARM_COMPUTE_CLNORMALIZATIONLAYEREX_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h
new file mode 100644
index 000000000..622a61b5e
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLPRELU_H__
+#define __ARM_COMPUTE_CLPRELU_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+class CLPReLU : public ICLSimpleFunction
+{
+public:
+ /** Initialise the function's source and destination.
+ *
+ * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32.
+ * @param[in] alpha Alpha (slope) tensor. Data types supported: QASYMM8/F16/F32.
+ * @param[out] output Output tensor. Data types supported: Same as @p input.
+ */
+ void configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output);
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLPRELU_H__*/
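A usage sketch for CLPReLU (illustration only, not from the diff); the per-channel alpha shape shown here is an assumption, the header itself only fixes the three-tensor configure() signature.

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLPReLU.h"

    using namespace arm_compute;

    void run_prelu_sketch()
    {
      CLScheduler::get().default_init();

      CLTensor input, alpha, output;
      input.allocator()->init(TensorInfo(TensorShape(8U, 8U, 16U), 1, DataType::F32));
      alpha.allocator()->init(TensorInfo(TensorShape(1U, 1U, 16U), 1, DataType::F32));  // assumed per-channel slopes
      output.allocator()->init(TensorInfo(TensorShape(8U, 8U, 16U), 1, DataType::F32));

      CLPReLU prelu;
      prelu.configure(&input, &alpha, &output);

      input.allocator()->allocate();
      alpha.allocator()->allocate();
      output.allocator()->allocate();

      prelu.run();
      CLScheduler::get().sync();
    }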
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h
new file mode 100644
index 000000000..d6ea486d1
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h
@@ -0,0 +1,47 @@
+/*
+* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+* Copyright (c) 2016-2018 ARM Limited.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+#ifndef __ARM_COMPUTE_CLPADLAYEREX_H__
+#define __ARM_COMPUTE_CLPADLAYEREX_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLPadLayerKernel
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32.
+ * @note The function pads the input tensor with the given padding sizes.
+ */
+class CLPadLayerEx : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported:
+ * U8/QASYMM8/S16/S32/F16/F32.
+ * @param[out] output Output tensor. Data types supported:
+ * U8/QASYMM8/S16/S32/F16/F32.
+ * @param[in] pad_size Tensor of padding values in NHWC format with shape [n, 2],
+ * where n is the rank of the tensor. Data types supported: S32
+ */
+ void configure(ICLTensor *input, ICLTensor *output, ICLTensor *pad_size);
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLPADLAYEREX_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPermuteEx.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPermuteEx.h
new file mode 100644
index 000000000..9a0cc213c
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPermuteEx.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLPERMUTEEX_H__
+#define __ARM_COMPUTE_CLPERMUTEEX_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute an @ref CLPermuteKernel. */
+class CLPermuteEx : public ICLSimpleFunction
+{
+public:
+ /** Set the input and output tensors.
+ *
+ * @param[in] input The input tensor to permute. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[out] output The output tensor. Data types supported: Same as @p input
+ * @param[in] perm Permutation vector
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const PermutationVector &perm);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLPermuteEx.
+ *
+ * @param[in] input First tensor input info. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[in] output Output tensor info. Data types supported: same as @p input.
+ * @param[in] perm Permutation vector
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const PermutationVector &perm);
+};
+}
+#endif /*__ARM_COMPUTE_CLPERMUTEEX_H__ */
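A usage sketch for CLPermuteEx (illustrative, not part of the diff). It swaps the first two dimensions with ACL's PermutationVector; the shapes are assumptions, and validate() can be called with the tensors' info() and the same vector before configure().

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLPermuteEx.h"

    using namespace arm_compute;

    void run_permute_sketch()
    {
      CLScheduler::get().default_init();

      const PermutationVector perm(1U, 0U, 2U);  // swap width and height, keep channels

      CLTensor input, output;
      input.allocator()->init(TensorInfo(TensorShape(32U, 28U, 3U), 1, DataType::F32));
      output.allocator()->init(TensorInfo(TensorShape(28U, 32U, 3U), 1, DataType::F32));

      CLPermuteEx permute;
      permute.configure(&input, &output, perm);

      input.allocator()->allocate();
      output.allocator()->allocate();

      permute.run();
      CLScheduler::get().sync();
    }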
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h
index c1383e21f..b142d3a2e 100644
--- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h
@@ -14,53 +14,61 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
+/**
+ * @file CLPixelWiseDivision.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLPixelWiseDivision class
+ */
#ifndef __ARM_COMPUTE_CLPIXELWISEDIVISION_H__
#define __ARM_COMPUTE_CLPIXELWISEDIVISION_H__
-#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
namespace arm_compute
{
class ICLTensor;
-/** Basic function to run @ref CLPixelWiseDivisionKernel. */
+/**
+ * @brief Class to run @ref CLPixelWiseDivisionKernel.
+ */
class CLPixelWiseDivision : public ICLSimpleFunction
{
public:
- /** Initialise the kernel's inputs, output and convertion policy.
- *
- * @param[in, out] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ /**
+ * @brief Initialise the kernel's inputs, output and conversion policy.
+ * @param[in, out] input1 An input tensor. Data types supported: U8/S16/F16/F32
* The input tensor is [in, out] because its TensorInfo might be
* modified inside the kernel in case of broadcasting of dimension 0.
* @param[in, out] input2 An input tensor. Data types supported: same as @p input1.
* The input tensor is [in, out] because its TensorInfo might be
* modified inside the kernel in case of broadcasting of dimension 0.
* @param[out] output The output tensor, Data types supported: same as @p input1.
- * Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
+ * Note: U8 requires both inputs to be U8.
* @param[in] scale Scale to apply after multiplication.
* Scale must be positive and its value must be either 1/255 or
- * 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1.
+ * 1/2^n where n is between 0 and 15.
* @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
* @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest
* even.
+ * @return N/A
*/
void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale = 1.f,
ConvertPolicy overflow_policy = ConvertPolicy::WRAP,
RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO);
- /** Static function to check if given info will lead to a valid configuration of @ref
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration of @ref
* CLPixelWiseDivision
- *
- * @param[in] input1 An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ * @param[in] input1 An input tensor info. Data types supported: U8/S16/F16/F32
* @param[in] input2 An input tensor info. Data types supported: same as @p input1.
* @param[in] output The output tensor info, Data types supported: same as @p input1.
- * Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
+ * Note: U8 requires both inputs to be U8.
* @param[in] scale Scale to apply after multiplication.
* Scale must be positive and its value must be either 1/255 or 1/2^n
- * where n is between 0 and 15. For QS8 and QS16 scale must be 1.
+ * where n is between 0 and 15.
* @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
* @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
- *
* @return a status
*/
static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h
deleted file mode 100644
index 14b473f33..000000000
--- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef __ARM_COMPUTE_CLREDUCE_MAX_H__
-#define __ARM_COMPUTE_CLREDUCE_MAX_H__
-
-#include "arm_compute/runtime/CL/CLArray.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to execute TopK operation. This function calls the following OpenCL kernels:
- *
- * -# @ref CLTopKV2Kernel
- */
-class CLReduceMax : public IFunction
-{
-public:
- /** Constructor */
- CLReduceMax();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLReduceMax(const CLReduceMax &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLReduceMax &operator=(const CLReduceMax &) = delete;
- /** Allow instances of this class to be moved */
- CLReduceMax(CLReduceMax &&) = default;
- /** Allow instances of this class to be moved */
- CLReduceMax &operator=(CLReduceMax &&) = default;
- /** Initialise the kernel's inputs and outputs.
- *
- * @note When locations of min and max occurrences are requested, the reported number of locations
- * is limited to the given array size.
- *
- * @param[in] input Input image. Data types supported: F32
- * @param[in] axis Axis to reduce. Data type supported: S32
- * @param[out] output indices related to top k values. Data types supported: F32.
- */
- void configure(ICLTensor *input, int32_t axis, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref
- * CLPixelWiseDivision
- *
- * @param[in] input Input image. Data types supported: F32
- * @param[in] axis Axis to reduce. Data type supported: S32
- * @param[out] output indices related to top k values. Data types supported: F32. *
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- void run_on_cpu();
-
- int32_t _axis;
-
- ICLTensor *_input;
- ICLTensor *_output;
-
- std::unique_ptr<ICLKernel> _kernel;
-};
-}
-#endif /*__ARM_COMPUTE_CLREDUCE_MAX_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h
new file mode 100644
index 000000000..e1a6f6ab4
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLReduceOperation.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLReduceOperation class
+ */
+
+#ifndef __ARM_COMPUTE_CLREDUCEOPERATION_H__
+#define __ARM_COMPUTE_CLREDUCEOPERATION_H__
+
+#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
+#include "arm_compute/core/TypesEx.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to perform ReduceOperation
+ */
+class CLReduceOperation : public IFunction
+{
+public:
+ /**
+ * @brief Construct a new ReduceOperation object
+ */
+ CLReduceOperation();
+
+ /**
+ * @brief Set the input and output tensors.
+ * @param[in] input Source tensor. Data types supported: U8/S32/F32
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[in] axis Axes along which to reduce. They must be sorted and contain no duplicates.
+ * @param[in] op Reduce operation to perform.
+ * @return N/A
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const std::set<uint32_t> &axis,
+ ReduceOperation op);
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration of @ref
+ * CLReduceOperation.
+ * @param[in] input Source tensor info. Data types supported: U8/S32/F32
+ * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[in] axis Axes along which to reduce. They must be sorted and contain no duplicates.
+ * @param[in] op Reduce operation to perform.
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const std::set<uint32_t> &axis, const ReduceOperation &op);
+
+ /**
+ * @brief Run the OpenCL kernel for this operation
+ * @return N/A
+ */
+ void run() override;
+
+private:
+ ICLTensor *_input;
+ ICLTensor *_output;
+ std::set<uint32_t> _axis;
+
+ std::unique_ptr<CLTensor[]> _interm_tensors{nullptr};
+ std::unique_ptr<CLReduceOperationKernel[]> _reduce_kernels{nullptr};
+};
+}
+#endif /*__ARM_COMPUTE_CLREDUCEOPERATION_H__ */
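A usage sketch for CLReduceOperation (illustrative, not from the diff). The shapes are assumptions, and ReduceOperation::MEAN is a hypothetical enumerator standing in for whatever TypesEx.h defines.

    #include <set>

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLReduceOperation.h"

    using namespace arm_compute;

    void run_reduce_sketch()
    {
      CLScheduler::get().default_init();

      CLTensor input, output;
      input.allocator()->init(TensorInfo(TensorShape(8U, 8U, 16U), 1, DataType::F32));
      output.allocator()->init(TensorInfo(TensorShape(1U, 1U, 16U), 1, DataType::F32));  // reduced over W and H

      const std::set<uint32_t> axis{0U, 1U};  // sorted, no duplicates, as the doc requires

      CLReduceOperation reduce;
      reduce.configure(&input, &output, axis, ReduceOperation::MEAN);  // enumerator name is assumed

      input.allocator()->allocate();
      output.allocator()->allocate();

      reduce.run();  // enqueues the internal reduction kernels
      CLScheduler::get().sync();
    }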
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReductionMean.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReductionMean.h
deleted file mode 100644
index 2081518c1..000000000
--- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReductionMean.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __ARM_COMPUTE_CLREDUCTIONMEAN_H__
-#define __ARM_COMPUTE_CLREDUCTIONMEAN_H__
-
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
-#include "arm_compute/core/CL/kernels/CLReductionMeanKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/IFunction.h"
-
-#include <cstdint>
-#include <memory>
-#include <vector>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Perform reduction operation.
- */
-class CLReductionMean : public IFunction
-{
-public:
- /** Default Constructor.
- */
- CLReductionMean();
-
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. Data types supported: F32. Data layouts supported: NCHW.
- * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
- * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1
- */
- void configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis);
-
- /** Static function to check if given info will lead to a valid configuration of @ref
- * CLReductionMean.
- *
- * @param[in] input Source tensor info. Data types supported: F32. Data layouts supported: NCHW.
- * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p
- * input.
- * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- std::vector<uint32_t> axis);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- CLReductionMeanKernel _reduction_mean_kernel;
- CLFillBorderKernel _fill_border_kernel;
-};
-}
-#endif /*__ARM_COMPUTE_CLREDUCTIONMEAN_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h
new file mode 100644
index 000000000..7e2df8986
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_H__
+#define __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLSpaceToBatchNDKernel
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/F16/S32/F32.
+ * @note The function divides the "spatial" dimensions of the input into a grid of blocks of
+ * shape block_shape, and interleaves these blocks with the "batch" dimension of the output.
+ */
+class CLSpaceToBatchND : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @note The data layout of input and output must be the same.
+ * @note The number of dimensions of input and output must be 4, and `spatial` dimensions
+ * are height and width.
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32.
+ * Data layout supported: NCHW/NHWC
+ * @param[in] block_size Tensor of integer values specifying block sizes for spatial
+ * dimension.
+ * Data types supported: S32
+ * @param[in] padding_size Tensor of integer values specifying padding sizes for spatial
+ * dimension.
+ * Data types supported: S32
+ * @param[out] output Output tensor. Data types supported: same as @p input.
+ * Data layout supported: NCHW/NHWC
+ */
+ void configure(const ICLTensor *input, const ICLTensor *block_size, const ICLTensor *padding_size,
+ ICLTensor *output);
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_H__ */
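A usage sketch for CLSpaceToBatchND (illustration only, not part of the diff). The shapes and the layout of the block_size/padding_size tensors are assumptions; in practice both S32 tensors would be filled on the host (map/unmap) before run().

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLSpaceToBatchND.h"

    using namespace arm_compute;

    void run_space_to_batch_sketch()
    {
      CLScheduler::get().default_init();

      CLTensor input, block_size, padding_size, output;
      input.allocator()->init(TensorInfo(TensorShape(4U, 4U, 8U, 1U), 1, DataType::F32));
      block_size.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::S32));        // e.g. {2, 2}
      padding_size.allocator()->init(TensorInfo(TensorShape(2U, 2U), 1, DataType::S32));  // zero padding
      output.allocator()->init(TensorInfo(TensorShape(2U, 2U, 8U, 4U), 1, DataType::F32));

      CLSpaceToBatchND s2b;
      s2b.configure(&input, &block_size, &padding_size, &output);

      input.allocator()->allocate();
      block_size.allocator()->allocate();
      padding_size.allocator()->allocate();
      output.allocator()->allocate();

      s2b.run();
      CLScheduler::get().sync();
    }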
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h
new file mode 100644
index 000000000..17f762092
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLSPACETODEPTH_H__
+#define __ARM_COMPUTE_CLSPACETODEPTH_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLSpaceToDepthKernel
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32.
+ * @note The function rearranges blocks of spatial data (height and width) into the depth
+ * (channel) dimension, according to the given block size.
+ */
+class CLSpaceToDepth : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[in] block_size Block size. Only integer values are supported.
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const int32_t block_size);
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLSPACETODEPTH_H__ */
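A corresponding usage sketch (same assumptions as the CLSpaceToBatchND example above; only the
configure call differs, taking an integer block size instead of tensors):

    CLTensor input, output;
    // ... init/allocate input and output with shapes consistent with block_size == 2 ...
    CLSpaceToDepth space_to_depth;
    space_to_depth.configure(&input, &output, 2);
    space_to_depth.run();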
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSquaredDifference.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSquaredDifference.h
new file mode 100644
index 000000000..3610ba71c
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSquaredDifference.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLSQUARED_DIFFERENCE_H__
+#define __ARM_COMPUTE_CLSQUARED_DIFFERENCE_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
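+/** Basic function to run the squared difference operation
+ *
+ * @note For each element, the output is (input1 - input2)^2.
+ */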
+class CLSquaredDifference : public ICLSimpleFunction
+{
+public:
+ /** Initialise the function's source and destination.
+ *
+ * @param[in] input1 Source tensor1. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[in] input2 Source tensor2. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[out] output Output tensor. Data types supported: Same as @p input1.
+ */
+ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output);
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLSQUARED_DIFFERENCE_H__*/
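A usage sketch under the same assumptions as above:

    CLTensor a, b, out;
    // ... init/allocate three tensors of identical shape and data type ...
    CLSquaredDifference squared_diff;
    squared_diff.configure(&a, &b, &out);
    squared_diff.run(); // element-wise (a - b)^2 written to out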
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSlice.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSlice.h
deleted file mode 100644
index f223a79be..000000000
--- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSlice.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef __ARM_COMPUTE_CLSTRIDEDSLICE_H__
-#define __ARM_COMPUTE_CLSTRIDEDSLICE_H__
-
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to run @ref CLStridedSliceKernel */
-class CLStridedSlice : public ICLSimpleFunction
-{
-public:
- /** Initialise the kernel's inputs and outputs
- *
- * @param[in] input First tensor input. Data type supported:
- * U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32
- * @param[out] output Output tensor. Data type supported: Same as @p input
- */
- void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData,
- ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask,
- int32_t shrinkAxisMask);
-};
-
-class CLStridedSliceCPU : public IFunction
-{
-public:
- /** Initialise inputs and outputs
- *
- * @param[in] input First tensor input.
- * @param[out] output Output tensor.
- */
- void configure(ICLTensor *input, ICLTensor *output, ICLTensor *beginData, ICLTensor *endData,
- ICLTensor *stridesData, int32_t beginMask, int32_t endMask,
- int32_t shrinkAxisMask);
-
- void run() override;
-
-private:
- void run_on_cpu();
-
- ICLTensor *_input;
- ICLTensor *_output;
- ICLTensor *_beginData;
- ICLTensor *_endData;
- ICLTensor *_stridesData;
- int32_t _beginMask;
- int32_t _endMask;
- int32_t _shrinkAxisMask;
-};
-}
-#endif /*__ARM_COMPUTE_CLSTRIDEDSLICE_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h
new file mode 100644
index 000000000..6b26a85c8
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLStridedSliceEx.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLStridedSliceEx class
+ */
+
+#ifndef __ARM_COMPUTE_CLSTRIDEDSLICEEX_H__
+#define __ARM_COMPUTE_CLSTRIDEDSLICEEX_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to run @ref CLStridedSliceKernel
+ */
+class CLStridedSliceEx : public ICLSimpleFunction
+{
+public:
+ /**
+ * @brief Initialise the kernel's inputs and outputs
+ * @param[in] input Tensor input. Data type supported:
+ * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output Output tensor. Data type supported: Same as @p input
+ * @param[in] beginData 'begin' vector of strided slice operation
+ * @param[in] endData 'end' vector of strided slice operation
+ * @param[in] stridesData 'strides' vector of strided slice operation
+ * @param[in] beginMask If the ith bit is set, begin[i] is ignored
+ * @param[in] endMask If the ith bit is set, end[i] is ignored
+ * @param[in] shrinkAxisMask If the ith bit is set, the ith specification shrinks the
+ * dimensionality by 1, taking on the value at index begin[i]
+ * @return N/A
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData,
+ ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask,
+ int32_t shrinkAxisMask);
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLSTRIDEDSLICEEX_H__ */
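A usage sketch under the same assumptions (begin/end/strides are assumed here to be 1-D S32
tensors with one element per input dimension, mirroring the usual StridedSlice inputs; the
header itself does not state their data type):

    CLTensor input, output, begin, end, strides;
    // ... init/allocate tensors ...
    CLStridedSliceEx strided_slice;
    strided_slice.configure(&input, &output, &begin, &end, &strides,
                            0 /* beginMask */, 0 /* endMask */, 0 /* shrinkAxisMask */);
    strided_slice.run();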
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h
index 06cd1ee9b..5327e016f 100644
--- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h
@@ -14,51 +14,79 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
+/**
+ * @file CLTopKV2.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLTopKV2 class
+ */
#ifndef __ARM_COMPUTE_CLTOPK_V2_H__
#define __ARM_COMPUTE_CLTOPK_V2_H__
#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h"
-#include "arm_compute/runtime/CL/CLArray.h"
#include "arm_compute/runtime/IFunction.h"
namespace arm_compute
{
class ICLTensor;
-/** Basic function to execute TopK operation. This function calls the following OpenCL kernels:
- *
- * -# @ref CLTopKV2Kernel
+/**
+ * @brief Class to execute TopKV2 operation.
*/
class CLTopKV2 : public IFunction
{
public:
- /** Constructor */
+ /**
+ * @brief Construct a new CLTopKV2 object
+ */
CLTopKV2();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ */
CLTopKV2(const CLTopKV2 &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ */
CLTopKV2 &operator=(const CLTopKV2 &) = delete;
- /** Allow instances of this class to be moved */
+
+ /**
+   * @brief Construct a new CLTopKV2 object by using the move constructor
+ * @param[in] CLTopKV2 object to move
+ */
CLTopKV2(CLTopKV2 &&) = default;
- /** Allow instances of this class to be moved */
+
+ /**
+   * @brief Move-assign a CLTopKV2 object.
+ * @param[in] CLTopKV2 object to assign. This object will be moved.
+ */
CLTopKV2 &operator=(CLTopKV2 &&) = default;
- /** Initialise the kernel's inputs and outputs.
- *
- * @note When locations of min and max occurrences are requested, the reported number of locations
- * is limited to the given array size.
- *
+
+ /**
+ * @brief Initialise the kernel's inputs and outputs.
* @param[in] input Input image. Data types supported: U8/S16/F32.
* @param[in] k The value of `k`.
* @param[out] values Top k values. Data types supported: S32 if input type is U8/S16, F32 if
* input type is F32.
- * @param[out] indices indices related to top k values. Data types supported: S32 if input type
+ * @param[out] indices Indices related to top k values. Data types supported: S32 if input type
* is U8/S16, F32 if input type is F32.
+ * @return N/A
*/
void configure(ICLTensor *input, int k, ICLTensor *values, ICLTensor *indices,
int total_bits = 32, int bits = 4);
- // Inherited methods overridden:
+ /**
+ * @brief Run the kernels contained in the function
+   * Depending on the value of the environment variable "ACL_TOPKV2", it works differently:
+   * - If "ACL_TOPKV2" == "GPU_SINGLE", quick sort on GPU is used.
+   * - If "ACL_TOPKV2" == "GPU", radix sort on GPU is used.
+   * - For any other value, TopKV2 runs on the CPU.
+ * @return N/A
+ */
void run() override;
private: