Imported Upstream version 0.4 [upstream/0.4]

author: Chunseok Lee <chunseok.lee@samsung.com> 2020-10-29 13:12:50 +0900
committer: Chunseok Lee <chunseok.lee@samsung.com> 2020-10-29 13:12:50 +0900
commit: d6b371e095d737922187a518b8faba1ef6f3a2b1 (patch)
tree: 9d90c09c887b5111389dbedf924f59206411cd5a /compute/ARMComputeEx
parent: c55f8a6db48cda9d3a78048338b7f18c4cca62b8 (diff)
download: nnfw-d6b371e095d737922187a518b8faba1ef6f3a2b1.tar.gz
nnfw-d6b371e095d737922187a518b8faba1ef6f3a2b1.tar.bz2
nnfw-d6b371e095d737922187a518b8faba1ef6f3a2b1.zip
149 files changed, 0 insertions, 25060 deletions
diff --git a/compute/ARMComputeEx/CMakeLists.txt b/compute/ARMComputeEx/CMakeLists.txt
deleted file mode 100644
index 58f558db2..000000000
--- a/compute/ARMComputeEx/CMakeLists.txt
+++ /dev/null
@@ -1,36 +0,0 @@
-nnfw_find_package(ARMCompute QUIET)
-
-if(NOT ARMCompute_FOUND)
-  message(STATUS "Check ARM Compute library extension build: need ARM Compute library")
-  return()
-else(NOT ARMCompute_FOUND)
-  message(STATUS "Check ARM Compute library extension build: OK")
-endif(NOT ARMCompute_FOUND)
-
-set(ACL_EX_BASE ${CMAKE_CURRENT_SOURCE_DIR})
-
-file(GLOB_RECURSE ACL_EX_SRCS "${ACL_EX_BASE}/*.cpp")
-
-# generate embeded cl_kernel
-execute_process (
-    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
-    COMMAND bash -c "python resolve_includes.py"
-)
-
-add_library(arm_compute_ex SHARED ${ACL_EX_SRCS})
-target_include_directories(arm_compute_ex PUBLIC ${ACL_EX_BASE})
-target_link_libraries(arm_compute_ex PRIVATE arm_compute)
-target_link_libraries(arm_compute_ex PRIVATE nnfw_common)
-target_link_libraries(arm_compute_ex PRIVATE nnfw_coverage)
-# Defines to enable validate check in debug build
-target_compile_definitions(arm_compute_ex PRIVATE EMBEDDED_KERNELS
-                                                  $<$<CONFIG:Debug>:ARM_COMPUTE_DEBUG_ENABLED ARM_COMPUTE_ASSERTS_ENABLED
-                                                                    ARM_COMPUTE_LOGGING_ENABLED>)
-# Validate check functions are not used on release build
-# Some parameter are used for validate check function call, and these parameter may not used on release build
-# Because clang requires to add "-Wno-unused-parameter -Wno-unused-function" after "-Wall",
-# this should be after linking nnfw_common and use interface lib linking
-add_library(ignore_unused_warning INTERFACE)
-target_compile_options(ignore_unused_warning INTERFACE -Wno-unused-parameter -Wno-unused-function)
-target_link_libraries(arm_compute_ex PRIVATE $<$<NOT:$<CONFIG:Debug>>:ignore_unused_warning>)
-install(TARGETS arm_compute_ex DESTINATION lib)
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h b/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h
deleted file mode 100644
index d29886a9d..000000000
--- a/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h
+++ /dev/null
@@ -1,268 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/**
- * @file      CLKernelLibraryEx.h
- * @ingroup   COM_AI_RUNTIME
- * @brief     This file is a cloned version of CLKernelLibrary.h in ACL. This file defines
- *            an interface for CLKernelLibrary.cpp which adds more OpenCL kernels on top of ACL.
- */
-
-#ifndef __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__
-#define __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__
-
-#include "arm_compute/core/CL/OpenCL.h"
-
-#include <map>
-#include <set>
-#include <string>
-#include <utility>
-
-namespace arm_compute
-{
-
-/**
- * @brief Class to build OpenCL kernels added from nnfw
- * */
-class CLKernelLibraryEx
-{
-  using StringSet = std::set<std::string>;
-
-private:
-  /**
-   * @brief Construct a new CLKernelLibraryEx object
-   */
-  CLKernelLibraryEx();
-
-public:
-  /**
-   * @brief Prevent instances of this class from being copied.
-   */
-  CLKernelLibraryEx(const CLKernelLibraryEx &) = delete;
-
-  /**
-   * @brief Prevent instances of this class from being copied.
-   */
-  const CLKernelLibraryEx &operator=(const CLKernelLibraryEx &) = delete;
-
-  /**
-   * @brief Get the KernelLibrary singleton.
-   * @return The KernelLibrary instance
-   */
-  static CLKernelLibraryEx &get();
-
-  /**
-   * @brief Initialise the kernel library.
-   * @param[in] kernel_path Path of the directory from which kernel sources are loaded.
-   * @param[in] context     CL context used to create programs.
-   * @param[in] device      CL device for which the programs are created.
-   * @return N/A
-   */
-  void init(std::string kernel_path, cl::Context context, cl::Device device)
-  {
-    _kernel_path = std::move(kernel_path);
-    _context = std::move(context);
-    _device = std::move(device);
-  }
-
-  /**
-   * @brief Set the path that the kernels reside in.
-   * @param[in] kernel_path Path of the directory from which kernel sources are loaded.
-   * @return N/A
-   */
-  void set_kernel_path(const std::string &kernel_path) { _kernel_path = kernel_path; };
-
-  /**
-   * @brief Get the path that the kernels reside in.
-   * @return the path of kernel files
-   */
-  std::string get_kernel_path() { return _kernel_path; };
-
-  /**
-   * @brief Get the source of the selected program.
-   * @param[in] program_name Program name.
-   * @return Source of the selected program.
-   */
-  std::string get_program_source(const std::string &program_name);
-
-  /**
-   * @brief Set the CL context used to create programs.
-   * @note Setting the context also resets the device to the
-   *       first one available in the new context.
-   * @param[in] context A CL context.
-   * @return N/A
-   */
-  void set_context(cl::Context context)
-  {
-    _context = std::move(context);
-    if (_context.get() == nullptr)
-    {
-      _device = cl::Device();
-    }
-    else
-    {
-      const auto cl_devices = _context.getInfo<CL_CONTEXT_DEVICES>();
-
-      if (cl_devices.empty())
-      {
-        _device = cl::Device();
-      }
-      else
-      {
-        _device = cl_devices[0];
-      }
-    }
-  }
-
-  /**
-   * @brief Return associated CL context.
-   * @return A CL context.
-   */
-  cl::Context &context() { return _context; }
-
-  /**
-   * @brief Set the CL device for which the programs are created.
-   * @param[in] device A CL device.
-   * @return N/A
-   */
-  void set_device(cl::Device device) { _device = std::move(device); }
-
-  /**
-   * @brief Gets the CL device for which the programs are created.
-   * @return A CL device.
-   */
-  cl::Device &get_device() { return _device; }
-
-  /**
-   * @brief Return the device version
-   * @return The content of CL_DEVICE_VERSION
-   */
-  std::string get_device_version();
-
-  /**
-   * @brief Create a kernel from the kernel library.
-   * @param[in] kernel_name       Kernel name.
-   * @param[in] build_options_set Kernel build options as a set.
-   * @return The created kernel.
-   */
-  Kernel create_kernel(const std::string &kernel_name,
-                       const StringSet &build_options_set = {}) const;
-
-  /**
-   * @brief Find the maximum number of local work items in a workgroup can be supported for the
-   * kernel.
-   * @param[in] kernel       kernel object
-   */
-
-  size_t max_local_workgroup_size(const cl::Kernel &kernel) const;
-  /**
-   * @brief Return the default NDRange for the device.
-   * @return default NDRangeof the device
-   */
-  cl::NDRange default_ndrange() const;
-
-  /**
-   * @brief Clear the library's cache of binary programs
-   * @return N/A
-   */
-  void clear_programs_cache()
-  {
-    _programs_map.clear();
-    _built_programs_map.clear();
-  }
-
-  /**
-   * @brief Access the cache of built OpenCL programs
-   * @return program map data structure of which key is name of kernel and value is
-   *         kerel source name. (*.cl)
-   */
-  const std::map<std::string, cl::Program> &get_built_programs() const
-  {
-    return _built_programs_map;
-  }
-
-  /**
-   * @brief Add a new built program to the cache
-   * @param[in] built_program_name Name of the program
-   * @param[in] program            Built program to add to the cache
-   * @return N/A
-   */
-  void add_built_program(const std::string &built_program_name, cl::Program program);
-
-  /**
-   * @brief Returns true if FP16 is supported by the CL device
-   * @return true if the CL device supports FP16
-   */
-  bool fp16_supported() const;
-
-  /**
-   * @brief Returns true if int64_base_atomics extension is supported by the CL device
-   * @return true if the CL device supports int64_base_atomics extension
-   */
-  bool int64_base_atomics_supported() const;
-
-private:
-  /**
-   * @brief Load program and its dependencies.
-   * @param[in] program_name Name of the program to load.
-   */
-  const Program &load_program(const std::string &program_name) const;
-  /**
-   * @brief Concatenates contents of a set into a single string.
-   * @param[in] s Input set to concatenate.
-   * @return Concatenated string.
-   */
-  std::string stringify_set(const StringSet &s) const;
-
-  cl::Context _context;     /**< Underlying CL context. */
-  cl::Device _device;       /**< Underlying CL device. */
-  std::string _kernel_path; /**< Path to the kernels folder. */
-  mutable std::map<std::string, const Program>
-      _programs_map; /**< Map with all already loaded program data. */
-  mutable std::map<std::string, cl::Program>
-      _built_programs_map; /**< Map with all already built program data. */
-  static const std::map<std::string, std::string>
-      _kernel_program_map; /**< Map that associates kernel names with programs. */
-  static const std::map<std::string, std::string>
-      _program_source_map; /**< Contains sources for all programs.
-                                Used for compile-time kernel inclusion. >*/
-};
-}
-#endif /* __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h
deleted file mode 100644
index a0aa0560b..000000000
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLARGMINMAXLAYERKERNELEX_H
-#define ARM_COMPUTE_CLARGMINMAXLAYERKERNELEX_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the reduction operation kernel
- *
- * @note The default data type for an uninitialized output tensor is
- *       signed 32-bit integer (S32). It is the user's responsibility to check
- *       that the results do not overflow because the indices are computed
- *       in unsigned 32-bit (U32).
- */
-class CLArgMinMaxLayerKernelEx : public ICLKernel
-{
-public:
-  /** Default constructor */
-  CLArgMinMaxLayerKernelEx();
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  CLArgMinMaxLayerKernelEx(const CLArgMinMaxLayerKernelEx &) = delete;
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  CLArgMinMaxLayerKernelEx &operator=(const CLArgMinMaxLayerKernelEx &) = delete;
-  /** Allow instances of this class to be moved */
-  CLArgMinMaxLayerKernelEx(CLArgMinMaxLayerKernelEx &&) = default;
-  /** Allow instances of this class to be moved */
-  CLArgMinMaxLayerKernelEx &operator=(CLArgMinMaxLayerKernelEx &&) = default;
-  /** Default destructor */
-  ~CLArgMinMaxLayerKernelEx() = default;
-
-  /** Set the input and output tensors.
-   *
-   * @param[in]  input       Source tensor. Data types supported: S32/F16/F32.
-   * @param[in]  prev_output Destination tensor of the previous iterations of @ref
-   * CLArgMinMaxLayerKernelEx. Data types supported: U32/S32
-   *                         Has to be nullptr for the first iteration
-   * @param[out] output      Destination tensor. Data types supported: U32/S32
-   *                         Output will have the same number of dimensions as input.
-   * @param[in]  axis        Axis along which to reduce. Supported reduction axis : 0,1,2,3
-   * @param[in]  op          Reduction operation to perform. Only ArgMin and ArgMax are supported.
-   */
-  void configure(const ICLTensor *input, const ICLTensor *prev_output, ICLTensor *output,
-                 unsigned int axis, ReductionOperation op);
-
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * CLArgMinMaxLayerKernelEx.
-   *
-   * @param[in] input       Source tensor info. Data types supported: S32/F16/F32.
-   * @param[in] prev_output Destination tensor info of the previous iterations. Data types
-   * supported: U32/S32
-   *                        Has to be nullptr for the first iteration
-   * @param[in] output      Destination tensor info. Data types supported: U32/S32
-   *                        Output will have the same number of dimensions as input.
-   * @param[in] axis        Axis along which to reduce. Supported reduction axis : 0,1,2,3
-   * @param[in] op          Reduction operation to perform.  Only ArgMin and ArgMax are supported.
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *prev_output,
-                         const ITensorInfo *output, unsigned int axis, ReductionOperation op);
-
-  // Inherited methods overridden:
-  void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-  const ICLTensor *_input;
-  const ICLTensor *_prev_output;
-  ICLTensor *_output;
-  unsigned int _reduction_axis;
-  ReductionOperation _op;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLARGMINMAXLAYERKERNELEX_H */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h
deleted file mode 100644
index bb6fcb8f5..000000000
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__
-#define __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/TypesEx.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to return truth values of two input tensors for Binary Logical Op*/
-class CLBinaryLogicalOpKernel : public ICLKernel
-{
-public:
-  /** Default constructor */
-  CLBinaryLogicalOpKernel();
-  /** Prevent instances of this class from being copied (As this class contains pointers). */
-  CLBinaryLogicalOpKernel(const CLBinaryLogicalOpKernel &) = delete;
-  /** Prevent instances of this class from being copied (As this class contains pointers). */
-  CLBinaryLogicalOpKernel &operator=(const CLBinaryLogicalOpKernel &) = delete;
-  /** Allow instances of this class to be moved */
-  CLBinaryLogicalOpKernel(CLBinaryLogicalOpKernel &&) = default;
-  /** Allow instances of this class to be moved */
-  CLBinaryLogicalOpKernel &operator=(CLBinaryLogicalOpKernel &&) = default;
-  /** Initialize the kernel's input, output.
-   *
-   * @param[in]  input1  Source tensor1.
-   * @param[in]  input2  Source tensor2.
-   * @param[out] output  Output tensor.
-   */
-  void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output,
-                 BinaryLogicalOperation op);
-
-  // Inherited methods overridden:
-  void run(const Window &window, cl::CommandQueue &queue) override;
-
-  BorderSize border_size() const override;
-
-private:
-  const ICLTensor *_input1;
-  const ICLTensor *_input2;
-  ICLTensor *_output;
-};
-
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h
deleted file mode 100644
index ed668fd9c..000000000
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/**
- * @file      CLCastBoolKernel.h
- * @ingroup   COM_AI_RUNTIME
- * @brief     This file defines CLCastBoolKernel class
- */
-
-#ifndef __ARM_COMPUTE_CLCASTBOOLKERNEL_H__
-#define __ARM_COMPUTE_CLCASTBOOLKERNEL_H__
-
-#include "arm_compute/core/CL/ICLSimple3DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/**
- * @brief Class for the kernel converting boolean type
- */
-class CLCastBoolKernel : public ICLSimple3DKernel
-{
-public:
-  /**
-   * @brief Initialise the kernel's input and output.
-   * @param[in]  input  Input tensor. Data types supported: U8
-   * @param[in]  output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
-   * @return N/A
-   */
-  void configure(const ICLTensor *input, ICLTensor *output);
-
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * CLCastBoolKernel
-   *
-   * @param[in] input  Source tensor info. Data types supported: U8.
-   * @param[in] output Destination tensor info. Data type supported: U8/S8/U16/S16/U32/S32/F16/F32.
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_CLCASTBOOLKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h
deleted file mode 100644
index a614d5259..000000000
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/**
- * @file      CLEmbeddingLookupKernel.h
- * @ingroup   COM_AI_RUNTIME
- * @brief     This file defines CLEmbeddingLookupKernel class
- */
-
-#ifndef __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__
-#define __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/**
-* @brief Class to perform EmbeddingLookup operation with opencl kernel
-*/
-class CLEmbeddingLookupKernel : public ICLKernel
-{
-public:
-  /**
-   * @brief Construct a CLEmbeddingLookupKernel object
-   * */
-  CLEmbeddingLookupKernel();
-
-  /**
-   * @brief Prevent instances of this class from being copied (As this class contains pointers)
-   * */
-  CLEmbeddingLookupKernel(const CLEmbeddingLookupKernel &) = delete;
-
-  /**
-   * @brief Prevent instances of this class from being copied (As this class contains pointers)
-   * */
-  CLEmbeddingLookupKernel &operator=(const CLEmbeddingLookupKernel &) = delete;
-
-  /**
-   * @brief Construct a CLEmbeddingLookupKernel object by using default move constructor
-   * @param[in] CLEmbeddingLookupKernel object to move
-   * */
-  CLEmbeddingLookupKernel(CLEmbeddingLookupKernel &&) = default;
-
-  /**
-   * @brief Move assignment operator
-   * @param[in] CLEmbeddingLookupKernel object to move
-   * */
-  CLEmbeddingLookupKernel &operator=(CLEmbeddingLookupKernel &&) = default;
-
-  /**
-   * @brief Destruct this object
-   * */
-  ~CLEmbeddingLookupKernel() = default;
-
-  /**
-   * @brief Set the input and output of the kernel
-   * @param[in]  input          Source tensor.
-   *                            Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
-   * @param[out] output         Destination tensor. Data type supported: Same as @p input
-   * @param[in]  lookups        Lookups are 1D tensor that values are indices into the first
-   *                            dimension of input.
-   *                            Data types supported: S32.
-   * @return N/A
-   */
-  void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups);
-
-  /**
-   * @brief Static function to check if given info will lead to a valid configuration of @ref
-   *        CLEmbeddingLookupKernel
-   * @param[in]  input          The input tensor info.
-   *                            Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
-   * @param[in]  output         The output tensor info, Data types supported: same as @p input1.
-   * @param[in]  lookups        Lookups info. Data types supported: S32.
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *output,
-                         const ITensorInfo *lookups);
-
-  /**
-   * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command
-   *        queue.
-   * @note  The queue is *not* flushed by this method, and therefore the kernel will not have
-   *        been executed by the time this method returns.
-   * @param[in]     window  Region on which to execute the kernel. (Must be a valid region of
-   *                        the window returned by window()).
-   * @param[in,out] queue   Command queue on which to enqueue the kernel.@return N/A
-   * @return N/A
-   */
-  void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-  const ICLTensor *_input;   /** Source tensor */
-  ICLTensor *_output;        /** Destination tensor */
-  const ICLTensor *_lookups; /** Lookups tensor */
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h
deleted file mode 100644
index 6630c7be7..000000000
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/**
- * @file      CLGatherExKernel.h
- * @ingroup   COM_AI_RUNTIME
- * @brief     This file defines CLGatherExKernel class
- */
-
-#ifndef __ARM_COMPUTE_CLGATHEREXKERNEL_H__
-#define __ARM_COMPUTE_CLGATHEREXKERNEL_H__
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/**
- * @brief Class to define an interface for the gather kernel.
- */
-class CLGatherExKernel : public ICLKernel
-{
-public:
-  /**
-   * @brief Construct CLGatherExKernel object
-   * */
-  CLGatherExKernel();
-
-  /**
-   * @brief Prevent instances of this class from being copied (As this class contains pointers).
-   */
-  CLGatherExKernel(const CLGatherExKernel &) = delete;
-
-  /**
-   * @brief Prevent instances of this class from being copied (As this class contains pointers).
-   */
-  CLGatherExKernel &operator=(const CLGatherExKernel &) = delete;
-
-  /**
-   * @brief Construct CLGatherExKernel object by using default move constructor
-   * @param[in] CLGatherExKernel object to move
-   */
-  CLGatherExKernel(CLGatherExKernel &&) = default;
-
-  /**
-   * @brief Move assignment operator
-   * @param[in] CLGatherExKernel object to move
-   */
-  CLGatherExKernel &operator=(CLGatherExKernel &&) = default;
-
-  /**
-   * @brief Initialise the kernel's input, indices, output and axis.
-   * @param[in]  input           An input tensor. Data types supported: U8/QASYMM8/S32/F32.
-   * @param[in]  indices         Indices tensor. Data types supported: S32.
-   * @param[out] output          The output tensor, Data types supported: same as @p input.
-   * @param[in]  axis            (Optional) The axis in @p input to gather @p indices from. Negative
-   * values wrap around. Defaults to 0
-   * @return N/A
-   */
-  void configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0);
-
-  /**
-   * @brief Static function to check if given info will lead to a valid configuration of @ref
-   * CLGatherExKernel
-   * @param[in]  input           An input tensor info. Data types supported: U8/QASYMM8/S32/F32.
-   * @param[in]  indices         Indices tensor info. Data types supported: S32.
-   * @param[in]  output          The output tensor info, Data types supported: same as @p input.
-   * @param[in]  axis            (Optional) The axis in @p input to gather @p indices from. Negative
-   * values wrap around. Defaults to 0
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *indices,
-                         const ITensorInfo *output, int axis = 0);
-
-  /**
-   * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command
-   *        queue.
-   * @note  The queue is *not* flushed by this method, and therefore the kernel will not have
-   *        been executed by the time this method returns.
-   * @param[in] window      Region on which to execute the kernel. (Must be a valid region of
-   *                        the window returned by window()).
-   * @param[in,out] queue   Command queue on which to enqueue the kernel.
-   * @return N/A
-   */
-  void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-  const ICLTensor *_input;
-  const ICLTensor *_indices;
-  ICLTensor *_output;
-  int _axis;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLGATHEREXKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h
deleted file mode 100644
index 99cfa61ec..000000000
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/**
- * @file      CLHashtableLookupKernel.h
- * @ingroup   COM_AI_RUNTIME
- * @brief     This file defines CLHashtableLookupKernel class
- */
-
-#ifndef __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__
-#define __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/**
- * @brief Class to perform HashtableLookup operation with opencl kernel
- */
-class CLHashtableLookupKernel : public ICLKernel
-{
-public:
-  /**
-   * @brief Construct a CLHashtableLookupKernel object
-   * */
-  CLHashtableLookupKernel();
-
-  /**
-   * @brief Prevent instances of this class from being copied (As this class contains pointers)
-   * */
-  CLHashtableLookupKernel(const CLHashtableLookupKernel &) = delete;
-
-  /**
-   * @brief Prevent instances of this class from being copied (As this class contains pointers)
-   * */
-  CLHashtableLookupKernel &operator=(const CLHashtableLookupKernel &) = delete;
-
-  /**
-   * @brief Construct a CLHashtableLookupKernel object by using default move constructor
-   * @param[in] CLHashtableLookupKernel object to move
-   * */
-  CLHashtableLookupKernel(CLHashtableLookupKernel &&) = default;
-
-  /**
-   * @brief Move assignment operator
-   * @param[in] CLHashtableLookupKernel object to move
-   * */
-  CLHashtableLookupKernel &operator=(CLHashtableLookupKernel &&) = default;
-
-  /**
-   * @brief Destruct this object
-   * */
-  ~CLHashtableLookupKernel() = default;
-
-  /**
-   * @brief Set the input and output of the kernel
-   * @param[in]  lookups  Lookups 1D tensor whose values are indices into the first dimension of
-   *                      input. Data types supported: S32
-   * @param[in]  keys     Keys 1D tensor. keys and input pair represent a map.
-   *                      Data types supported: S32
-   * @param[in]  input    Source tensor.
-   *                      Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
-   * @param[out] output   Destination tensor. Data types and data layouts supported: Same as @p
-   *                      input.
-   * @param[out] hits     Hits 1D tensor. A boolean tensor that indicates whether the lookup hits
-   *                      (True) or not (False). Data types supported: U8/QASYMM8
-   * @return N/A
-   */
-  void configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *input,
-                 ICLTensor *output, ICLTensor *hits);
-
-  /**
-   * @brief Static function to check if given info will lead to a valid configuration of @ref
-   *        CLHashtableLookupKernel
-   * @param[in]  lookups  The lookups tensor info. Data types supported: S32.
-   * @param[in]  keys     The keys tensor info. keys and input pair represent a map.
-   *                      Data types supported: S32
-   * @param[in]  input    The input tensor info.
-   *                      Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
-   * @param[in]  output   The output tensor info. Data types and data layouts supported: Same as
-   *                      @p input.
-   * @param[in]  hits     The hits tensor info. A boolean tensor that indicates whether the lookup
-   *                      hits (True) or not (False).
-   *                      Data types supported: U8/QASYMM8
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *lookups, const ITensorInfo *keys,
-                         const ITensorInfo *input, const ITensorInfo *output,
-                         const ITensorInfo *hits);
-
-  /**
-   * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command
-   *        queue.
-   * @note  The queue is *not* flushed by this method, and therefore the kernel will not have
-   *        been executed by the time this method returns.
-   * @param[in]     window  Region on which to execute the kernel. (Must be a valid region of
-   *                        the window returned by window()).
-   * @param[in,out] queue   Command queue on which to enqueue the kernel.
-   * @return N/A
-   */
-  void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-  const ICLTensor *_lookups{nullptr};                 /**< Lookups tensor */
-  const ICLTensor *_keys{nullptr};                    /**< Keys tensor */
-  const ICLTensor *_input{nullptr};                   /**< Source tensor */
-  ICLTensor *_output{nullptr};                        /**< Destination tensor */
-  ICLTensor *_hits{nullptr};                          /**< Hits tensor */
-  std::unique_ptr<CLTensor> _lookup_indices{nullptr}; /**< Lookup indices tensor */
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h
deleted file mode 100644
index f57e799ad..000000000
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__
-#define __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for performing an instance normalization */
-class CLInstanceNormalizationLayerKernelEx : public ICLKernel
-{
-public:
-  /** Constructor */
-  CLInstanceNormalizationLayerKernelEx();
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  CLInstanceNormalizationLayerKernelEx(const CLInstanceNormalizationLayerKernelEx &) = delete;
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  CLInstanceNormalizationLayerKernelEx &
-  operator=(const CLInstanceNormalizationLayerKernelEx &) = delete;
-  /** Default move constructor */
-  CLInstanceNormalizationLayerKernelEx(CLInstanceNormalizationLayerKernelEx &&) = default;
-  /** Default move assignment operator */
-  CLInstanceNormalizationLayerKernelEx &
-  operator=(CLInstanceNormalizationLayerKernelEx &&) = default;
-  /** Default destructor */
-  ~CLInstanceNormalizationLayerKernelEx() = default;
-
-  /** Set the input and output tensors.
-   *
-   * @param[in, out] input   Source tensor. Data types supported: F16/F32. Data layout supported:
-   * NCHW (NOTE(review): validate() below also lists NHWC - confirm which is correct)
-   * @param[out]     output  Destination tensor. Data types and data layouts supported: same as @p
-   * input.
-   * @param[in]      gamma   (Optional) The scale tensor applied to the normalized tensor. Defaults
-   * to nullptr
-   * @param[in]      beta    (Optional) The offset tensor applied to the normalized tensor. Defaults
-   * to nullptr
-   * @param[in]      epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
-   */
-  void configure(ICLTensor *input, ICLTensor *output, ICLTensor *gamma = nullptr,
-                 ICLTensor *beta = nullptr, float epsilon = 1e-12f);
-
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * CLInstanceNormalizationLayerKernelEx.
-   *
-   * @param[in] input   Source tensor info. In case of @p output tensor = nullptr this tensor will
-   * store the result of the normalization.
-   *                    Data types supported: F16/F32. Data layout supported: NHWC, NCHW
-   * @param[in] output  Destination tensor info. Data types and data layouts supported: same as @p
-   * input.
-   * @param[in] gamma   (Optional) The scale tensor applied to the normalized tensor. Defaults to
-   * nullptr
-   * @param[in] beta    (Optional) The offset tensor applied to the normalized tensor. Defaults to
-   * nullptr
-   * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *output,
-                         const ITensorInfo *gamma = nullptr, const ITensorInfo *beta = nullptr,
-                         float epsilon = 1e-12f);
-
-  // Inherited methods overridden:
-  void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-  ICLTensor *_input;
-  ICLTensor *_output;
-  ICLTensor *_gamma;
-  ICLTensor *_beta;
-  float _epsilon;
-  bool _run_in_place;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h
deleted file mode 100644
index 90e8b5705..000000000
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLMULTIPLYSCALEFACTORKERNEL_H__
-#define __ARM_COMPUTE_CLMULTIPLYSCALEFACTORKERNEL_H__
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ITensor; // NOTE(review): the class below only uses ICLTensor - confirm this is intended
-
-/** Interface to multiply scale factor kernel. */
-class CLMultiplyScaleFactorKernel : public ICLKernel
-{
-public:
-  /** Default constructor */
-  CLMultiplyScaleFactorKernel();
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  CLMultiplyScaleFactorKernel(const CLMultiplyScaleFactorKernel &) = delete;
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  CLMultiplyScaleFactorKernel &operator=(const CLMultiplyScaleFactorKernel &) = delete;
-  /** Default move constructor */
-  CLMultiplyScaleFactorKernel(CLMultiplyScaleFactorKernel &&) = default;
-  /** Default move assignment operator */
-  CLMultiplyScaleFactorKernel &operator=(CLMultiplyScaleFactorKernel &&) = default;
-  /** Default destructor */
-  ~CLMultiplyScaleFactorKernel() = default;
-  /** Set input, output tensors.
-   *
-   * @param[in]  input        Source tensor. Data type supported: S32.
-   * @param[in]  scale_factor Scale tensor. Data type supported: F16/F32.
-   * @param[out] output       Destination tensor. Data type supported: Same as @p scale_factor.
-   * @param[in]  multiplier   (Optional) Additional scale value. Defaults to 1.
-   */
-  void configure(const ICLTensor *input, const ICLTensor *scale_factor, ICLTensor *output,
-                 float multiplier = 1.f);
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * CLMultiplyScaleFactorKernel
-   *
-   * @param[in] input        Input tensor info. Data types supported: S32.
-   * @param[in] scale_factor Scale tensor info. Data type supported: F16/F32.
-   * @param[in] output       Output tensor info. Data types supported: Same as @p scale_factor.
-   * @note Unlike configure(), this function takes no multiplier argument.
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *scale_factor,
-                         const ITensorInfo *output);
-
-  /**
-   * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command
-   *        queue.
-   * @note  The queue is *not* flushed by this method, and therefore the kernel will not have
-   *        been executed by the time this method returns.
-   * @param[in] window      Region on which to execute the kernel. (Must be a valid region of
-   *                        the window returned by window()).
-   * @param[in,out] queue   Command queue on which to enqueue the kernel.
-   * @return N/A
-   */
-  void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-  const ICLTensor *_input;
-  const ICLTensor *_scale_factor;
-  ICLTensor *_output;
-  float _multiplier;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLMULTIPLYSCALEFACTORKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h
deleted file mode 100644
index fa383c0d0..000000000
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLNEGKERNEL_H__
-#define __ARM_COMPUTE_CLNEGKERNEL_H__
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform a negation operation on a tensor */
-class CLNegKernel : public ICLKernel
-{
-public:
-  /** Default constructor */
-  CLNegKernel();
-  /** Prevent instances of this class from being copied (As this class contains pointers). */
-  CLNegKernel(const CLNegKernel &) = delete;
-  /** Prevent instances of this class from being copied (As this class contains pointers). */
-  CLNegKernel &operator=(const CLNegKernel &) = delete;
-  /** Allow instances of this class to be move-constructed */
-  CLNegKernel(CLNegKernel &&) = default;
-  /** Allow instances of this class to be move-assigned */
-  CLNegKernel &operator=(CLNegKernel &&) = default;
-  /** Initialize the kernel's input and output.
-   *
-   * @param[in]  input  Source tensor.
-   * @param[out] output Destination tensor.
-   */
-  void configure(const ICLTensor *input, ICLTensor *output);
-
-  // Inherited methods overridden:
-  void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-  const ICLTensor *_input;
-  ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLNEGKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h
deleted file mode 100644
index a512057b9..000000000
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_CLONEHOTKERNEL_H__
-#define __ARM_COMPUTE_CLONEHOTKERNEL_H__
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-namespace arm_compute
-{
-class ICLTensor;
-/** Interface for the kernel to perform one-hot encoding */
-class CLOneHotKernel : public ICLKernel
-{
-public:
-  /** Default constructor */
-  CLOneHotKernel();
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  CLOneHotKernel(const CLOneHotKernel &) = delete;
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  CLOneHotKernel &operator=(const CLOneHotKernel &) = delete;
-  /** Allow instances of this class to be move-constructed */
-  CLOneHotKernel(CLOneHotKernel &&) = default;
-  /** Allow instances of this class to be move-assigned */
-  CLOneHotKernel &operator=(CLOneHotKernel &&) = default;
-  /** Default destructor */
-  ~CLOneHotKernel() = default;
-  /** Initialise the kernel's inputs and output
-   *
-   * @param[in]  indices   Indices tensor. Supported tensor rank: up to 3. Must be one of the
-   * following types: U32/S32
-   * @param[in]  on_value  On value tensor. Supported tensor rank: only 1. Data type supported:
-   * U8/S8/U16/S16/F16/U32/S32/F32
-   * @param[in]  off_value Off value tensor. Supported tensor rank: only 1. Data type supported:
-   * Same as @p on_value
-   * @param[out] output    Destination tensor. Data type supported: Same as @p on_value
-   * @param[in]  depth     The depth of the one hot dimension.
-   * @param[in]  axis      (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
-   * Value must be in range [-indices.rank , indices.rank)
-   */
-  void configure(const ICLTensor *indices, const ICLTensor *on_value, const ICLTensor *off_value,
-                 ICLTensor *output, int depth, int axis = -1);
-  /** Initialise the kernel's inputs and output already initialized to off_value
-   *
-   * @param[in]  indices   Indices tensor. Supported tensor rank: up to 3. Must be one of the
-   * following types: U32/S32
-   * @param[in]  on_value  On value tensor. Supported tensor rank: only 1. Data type supported:
-   * U8/S8/U16/S16/F16/U32/S32/F32
-   * @param[out] output    Destination tensor. Data type supported: Same as @p on_value
-   * @param[in]  depth     The depth of the one hot dimension.
-   * @param[in]  axis      (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
-   * Value must be in range [-indices.rank , indices.rank)
-   */
-  void configure(const ICLTensor *indices, const ICLTensor *on_value, ICLTensor *output, int depth,
-                 int axis = -1);
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * CLOneHotKernel
-   *
-   * @param[in]  indices   Indices tensor. Supported tensor rank: up to 3. Must be one of the
-   * following types: U32/S32
-   * @param[in]  on_value  On value tensor. Supported tensor rank: only 1. Data type supported:
-   * U8/S8/U16/S16/F16/U32/S32/F32
-   * @param[in]  off_value Off value tensor. Supported tensor rank: only 1. Data type supported:
-   * Same as @p on_value
-   * @param[in]  output    Destination tensor. Data type supported: Same as @p on_value
-   * @param[in]  depth     The depth of the one hot dimension.
-   * @param[in]  axis      (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
-   * Value must be in range [-indices.rank , indices.rank)
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *indices, const ITensorInfo *on_value,
-                         const ITensorInfo *off_value, const ITensorInfo *output, int depth,
-                         int axis = -1);
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * CLOneHotKernel without off_value
-   *
-   * @param[in]  indices   Indices tensor. Supported tensor rank: up to 3. Must be one of the
-   * following types: U32/S32
-   * @param[in]  on_value  On value tensor. Supported tensor rank: only 1. Data type supported:
-   * U8/S8/U16/S16/F16/U32/S32/F32
-   * @param[in]  output    Destination tensor. Data type supported: Same as @p on_value
-   * @param[in]  depth     The depth of the one hot dimension.
-   * @param[in]  axis      (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
-   * Value must be in range [-indices.rank , indices.rank)
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *indices, const ITensorInfo *on_value,
-                         const ITensorInfo *output, int depth, int axis = -1);
-  // Inherited methods overridden:
-  void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-  /** Initialise the kernel's inputs and outputs internally
-   *
-   * @param[in]  indices   Indices tensor. Supported tensor rank: up to 3. Must be one of the
-   * following types: U32/S32
-   * @param[in]  on_value  On value tensor. Supported tensor rank: only 1. Data type supported:
-   * U8/S8/U16/S16/F16/U32/S32/F32
-   * @param[out] output    Destination tensor. Data type supported: Same as @p on_value
-   * @param[in]  depth     The depth of the one hot dimension.
-   * @param[in]  axis      The axis to fill. Negative values wrap around.
-   * Value must be in range [-indices.rank , indices.rank)
-   */
-  void configure_common(const ICLTensor *indices, const ICLTensor *on_value, ICLTensor *output,
-                        int depth, int axis);
-
-private:
-  const ICLTensor *_indices;   /**< Indices tensor */
-  const ICLTensor *_on_value;  /**< On value tensor */
-  const ICLTensor *_off_value; /**< Off value tensor */
-  ICLTensor *_output;          /**< Destination tensor */
-  bool _is_off_value_memset;   /**< Whether off_value is zero */
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLONEHOTKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h
deleted file mode 100644
index 4e1b56cba..000000000
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLQUANTIZATIONSYMMETRICKERNEL_H__
-#define __ARM_COMPUTE_CLQUANTIZATIONSYMMETRICKERNEL_H__
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the symmetric quantization layer kernel.
- *
- * @note The implementation supports only 2D input tensors.
- */
-class CLQuantizationSymmetricKernel : public ICLKernel
-{
-public:
-  /** Default constructor */
-  CLQuantizationSymmetricKernel();
-  /** Prevent instances of this class from being copy-constructed (As this class contains pointers) */
-  CLQuantizationSymmetricKernel(const CLQuantizationSymmetricKernel &) = delete;
-  /** Prevent instances of this class from being copy-assigned (As this class contains pointers) */
-  CLQuantizationSymmetricKernel &operator=(const CLQuantizationSymmetricKernel &) = delete;
-  /** Default move constructor */
-  CLQuantizationSymmetricKernel(CLQuantizationSymmetricKernel &&) = default;
-  /** Default move assignment operator */
-  CLQuantizationSymmetricKernel &operator=(CLQuantizationSymmetricKernel &&) = default;
-  /** Default destructor */
-  ~CLQuantizationSymmetricKernel() = default;
-  /** Set the input, scale factor and output tensors.
-   *
-   * @param[in]  input        Source tensor. Data types supported: F32/F16.
-   * @param[in]  scale_factor Scale tensor of @p output. Data type supported: Same as @p input.
-   * @param[out] output       Destination tensor with the same dimensions as input. Data types
-   * supported: S8.
-   *
-   * @note Output auto initialization is not supported by this kernel
-   */
-  void configure(const ICLTensor *input, const ICLTensor *scale_factor, ICLTensor *output);
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * CLQuantizationSymmetricKernel
-   *
-   * @param[in] input        Input tensor info. Data types supported: F32/F16.
-   * @param[in] scale_factor Scale tensor info of @p output. Data type supported: Same as @p input.
-   * @param[in] output       Destination tensor info with the same dimensions as input. Data types
-   * supported: S8.
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *scale_factor,
-                         const ITensorInfo *output);
-
-  // Inherited methods overridden:
-  void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-  const ICLTensor *_input; /**< Source tensor (presumably stored by configure()) */
-  const ICLTensor *_scale_factor; /**< Scale tensor (presumably stored by configure()) */
-  ICLTensor *_output; /**< Destination tensor (presumably stored by configure()) */
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLQUANTIZATIONSYMMETRICKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h
deleted file mode 100644
index 9b8a239d3..000000000
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/**
- * @file CLReduceOperationKernel.h
- * @brief This file defines CLReduceOperationKernel class
- * @ingroup COM_AI_RUNTIME
- */
-
-#ifndef __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__
-#define __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/TypesEx.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/**
- * @brief Class to define the interface of the reduce operation kernel (an ICLKernel)
- */
-class CLReduceOperationKernel : public ICLKernel
-{
-public:
-  /**
-   * @brief Default constructor
-   */
-  CLReduceOperationKernel();
-  /**
-   * @brief Prevent instances of this class from being copy-constructed (it contains pointers)
-   */
-  CLReduceOperationKernel(const CLReduceOperationKernel &) = delete;
-  /**
-   * @brief Prevent instances of this class from being copy-assigned (it contains pointers)
-   */
-  CLReduceOperationKernel &operator=(const CLReduceOperationKernel &) = delete;
-  /**
-   * @brief Allow instances of this class to be move-constructed
-   */
-  CLReduceOperationKernel(CLReduceOperationKernel &&) = default;
-  /**
-   * @brief Allow instances of this class to be move-assigned
-   */
-  CLReduceOperationKernel &operator=(CLReduceOperationKernel &&) = default;
-  /**
-   * @brief Default destructor
-   */
-  ~CLReduceOperationKernel() = default;
-
-  /**
-   * @brief Set the input and output tensors.
-   * @param[in]  input  Source tensor. Data types supported: U8/S32/F32.
-   * @param[out] output Destination tensor. Data types supported: Same as @p input.
-   *                    Output will have the same number of dimensions as input.
-   * @param[in]  axis   Axis along which to reduce.
-   * @param[in]  op     Reduce operation to perform.
-   * @return N/A
-   */
-  void configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis,
-                 ReduceOperation op);
-
-  /**
-   * @brief Static function to check if given info will lead to a valid configuration of @ref
-   *        CLReduceOperationKernel.
-   * @param[in] input  Source tensor info. Data types supported: U8/S32/F32.
-   * @param[in] output Destination tensor info. Data types supported: Same as @p input.
-   *                   Output will have the same number of dimensions as input.
-   * @param[in] axis   Axis along which to reduce.
-   * @param[in] op     Reduce operation to perform.
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis,
-                         ReduceOperation op);
-
-  /**
-   * @brief Run CLReduceOperationKernel op (overrides the inherited run())
-   * @param[in] window  Window to be used for in_slice
-   * @param[in] queue   CLQueue
-   * @return N/A
-   */
-  void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-  const ICLTensor *_input; /**< Source tensor (presumably stored by configure()) */
-  ICLTensor *_output; /**< Destination tensor (presumably stored by configure()) */
-  uint32_t _axis; /**< Axis along which to reduce (presumably stored by configure()) */
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h
deleted file mode 100644
index 4d4478ece..000000000
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLSCALEFACTORSYMM8KERNEL_H__
-#define __ARM_COMPUTE_CLSCALEFACTORSYMM8KERNEL_H__
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to compute per-batch scale values of a 2D tensor (see configure()).
- */
-class CLScaleFactorSymm8Kernel : public ICLKernel
-{
-public:
-  /** Default constructor */
-  CLScaleFactorSymm8Kernel();
-  /** Prevent instances of this class from being copy-constructed (As this class contains pointers) */
-  CLScaleFactorSymm8Kernel(const CLScaleFactorSymm8Kernel &) = delete;
-  /** Prevent instances of this class from being copy-assigned (As this class contains pointers) */
-  CLScaleFactorSymm8Kernel &operator=(const CLScaleFactorSymm8Kernel &) = delete;
-  /** Allow instances of this class to be move-constructed */
-  CLScaleFactorSymm8Kernel(CLScaleFactorSymm8Kernel &&) = default;
-  /** Allow instances of this class to be move-assigned */
-  CLScaleFactorSymm8Kernel &operator=(CLScaleFactorSymm8Kernel &&) = default;
-  /** Initialise the kernel's input and output.
-   *
-   * @param[in]  input  Input tensor with 2 dimensions. The first dimension will be interpreted as
-   *                    batches. Data types supported: F32.
-   * @param[out] output Output tensor with shape [batches] which stores the scale values for each 2D
-   *                    input tensor.
-   *                    The dimensions over the first must match the batched dimensions of the input
-   *                    tensor. Data types supported: F32.
-   */
-  void configure(const ICLTensor *input, ICLTensor *output);
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * CLScaleFactorSymm8Kernel
-   *
-   * @param[in] input  Input tensor info.  Data types supported: F32.
-   * @param[in] output Output tensor info with shape [batches] which stores the scale values for
-   *                   each 2D input tensor.
-   *                   The dimensions over the first must match the batched dimensions of the input
-   *                   tensor. Data types supported: F32.
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
-  /** Resets global minimum and maximum
-   * NOTE(review): this class declares no min/max member; wording may be inherited - confirm.
-   * @param[in,out] queue Command queue on which to map and unmap the min_max tensor
-   */
-  void reset(cl::CommandQueue &queue);
-
-  // Inherited methods overridden:
-  void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-  const ICLTensor *_input; /**< Input tensor (presumably stored by configure()) */
-  ICLTensor *_output; /**< Output tensor (presumably stored by configure()) */
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLSCALEFACTORSYMM8KERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h
deleted file mode 100644
index aa4a14812..000000000
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h
+++ /dev/null
@@ -1,680 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/**
- * @file CLTopKV2Kernel.h
- * @brief This file defines classes for TopKV2Kernel
- * @ingroup COM_AI_RUNTIME
- */
-
-#ifndef __ARM_COMPUTE_CLTOPKV2KERNEL_H__
-#define __ARM_COMPUTE_CLTOPKV2KERNEL_H__
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-// Tunable parameters (these values can be changed).
-#define _ITEMS 16                          // number of items in a group
-#define _GROUPS 4                          // the number of virtual processors is _ITEMS * _GROUPS
-#define _HISTOSPLIT (_ITEMS * _GROUPS / 2) // number of splits of the histogram; 32 with the values above
-#define PERMUT                             // store the final permutation
-// NOTE(review): names starting with '_' + an uppercase letter are reserved identifiers in C++.
-
-// Disable GPU implementation
-// TODO Enable GPU implementation with verification, or remove code
-//      Invalid result on GPU
-#if 0
-namespace arm_compute
-{
-class ICLTensor;
-
-/**
- * @brief Class to define the CLTopKV2Single kernel (an ICLKernel)
- */
-class CLTopKV2Single : public ICLKernel
-{
-public:
-  /**
-   * @brief Constructor
-   */
-  CLTopKV2Single();
-  /**
-   * @brief Prevent instances of this class from being copy-constructed (it contains pointers).
-   * @param [in] copiedInstance Const reference of CLTopKV2Single to be copied
-   */
-  CLTopKV2Single(const CLTopKV2Single &) = delete;
-  /**
-   * @brief Prevent instances of this class from being copy-assigned (it contains pointers).
-   * @param [in] copiedInstance Const reference of CLTopKV2Single to be copied
-   * @return Reference of this instance
-   */
-  CLTopKV2Single &operator=(const CLTopKV2Single &) = delete;
-  /**
-   * @brief Allow instances of this class to be move-constructed
-   * @param [in] movedInstance Rvalue reference of CLTopKV2Single to be moved
-   */
-  CLTopKV2Single(CLTopKV2Single &&) = default;
-  /**
-   * @brief Allow instances of this class to be move-assigned
-   * @param [in] movedInstance Rvalue reference of CLTopKV2Single to be moved
-   * @return Reference of this instance
-   */
-  CLTopKV2Single &operator=(CLTopKV2Single &&) = default;
-
-  /**
-   * @brief Initialise kernel with params
-   * @param[in] input An input tensor
-   * @param[in] topk_values Values of the top k predictions
-   * @param[in] topk_indices Indices of the top k predictions
-   * @param[in] indices Indices
-   * @param[in] temp_stack Temp stack
-   * @param[in] k K of the top k predictions
-   * @param[in] n Number of times to quick-sort
-   * @return N/A
-   */
-  void configure(ICLTensor *input, ICLTensor *topk_values, ICLTensor *topk_indices,
-                 cl::Buffer *indices, cl::Buffer *temp_stack, int k, int n);
-
-  /**
-   * @brief Run CLTopKV2Single op (overrides the inherited run())
-   * @param[in] window  Window to be used for in_slice
-   * @param[in] queue   cl::CommandQueue
-   * @return N/A
-   */
-  void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-  ICLTensor *_input; /**< Input tensor (presumably stored by configure()) */
-  ICLTensor *_topk_values; /**< Top k values tensor (presumably stored by configure()) */
-  ICLTensor *_topk_indices; /**< Top k indices tensor (presumably stored by configure()) */
-};
-
-/**
- * @brief Class to define the CLTopKV2Init kernel (an ICLKernel)
- */
-class CLTopKV2Init : public ICLKernel
-{
-public:
-  /**
-   * @brief Constructor
-   */
-  CLTopKV2Init();
-  /**
-   * @brief Prevent instances of this class from being copy-constructed (it contains pointers).
-   * @param [in] copiedInstance Const reference of CLTopKV2Init to be copied
-   */
-  CLTopKV2Init(const CLTopKV2Init &) = delete;
-  /**
-   * @brief Prevent instances of this class from being copy-assigned (it contains pointers).
-   * @param [in] copiedInstance Const reference of CLTopKV2Init to be copied
-   * @return Reference of this instance
-   */
-  CLTopKV2Init &operator=(const CLTopKV2Init &) = delete;
-  /**
-   * @brief Allow instances of this class to be move-constructed
-   * @param [in] movedInstance Rvalue reference of CLTopKV2Init to be moved
-   */
-  CLTopKV2Init(CLTopKV2Init &&) = default;
-  /**
-   * @brief Allow instances of this class to be move-assigned
-   * @param [in] movedInstance Rvalue reference of CLTopKV2Init to be moved
-   * @return Reference of this instance
-   */
-  CLTopKV2Init &operator=(CLTopKV2Init &&) = default;
-
-  /**
-   * @brief Initialise kernel with params
-   * @param[in] input An input tensor
-   * @param[in] in_key_buf Buffer of input key
-   * @param[in] in_ind_buf Buffer of input index
-   * @param[in] n Number of times to quick-sort
-   * @return N/A
-   */
-  void configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffer *in_ind_buf, int n);
-
-  /**
-   * @brief Run CLTopKV2Init op (overrides the inherited run())
-   * @param[in] window  Window to be used for in_slice
-   * @param[in] queue   cl::CommandQueue
-   * @return N/A
-   */
-  void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-  ICLTensor *_input; /**< Input tensor (presumably stored by configure()) */
-};
-
-/**
- * @brief Class to define the CLRadixSortHistogram kernel (an ICLKernel)
- */
-class CLRadixSortHistogram : public ICLKernel
-{
-public:
-  /**
-   * @brief Constructor
-   */
-  CLRadixSortHistogram();
-  /**
-   * @brief Prevent instances of this class from being copy-constructed (it contains pointers).
-   * @param [in] copiedInstance Const reference of CLRadixSortHistogram to be copied
-   */
-  CLRadixSortHistogram(const CLRadixSortHistogram &) = delete;
-  /**
-   * @brief Prevent instances of this class from being copy-assigned (it contains pointers).
-   * @param [in] copiedInstance Const reference of CLRadixSortHistogram to be copied
-   * @return Reference of this instance
-   */
-  CLRadixSortHistogram &operator=(const CLRadixSortHistogram &) = delete;
-  /**
-   * @brief Allow instances of this class to be move-constructed
-   * @param [in] movedInstance Rvalue reference of CLRadixSortHistogram to be moved
-   */
-  CLRadixSortHistogram(CLRadixSortHistogram &&) = default;
-  /**
-   * @brief Allow instances of this class to be move-assigned
-   * @param [in] movedInstance Rvalue reference of CLRadixSortHistogram to be moved
-   * @return Reference of this instance
-   */
-  CLRadixSortHistogram &operator=(CLRadixSortHistogram &&) = default;
-
-  /**
-   * @brief Initialise kernel with params
-   * @param[out] hist_buf Buffer of histogram
-   * @param[in] bits Number of bits to be used for radix sort
-   * @param[in] n Integer number size to sort
-   * @return N/A
-   */
-  void configure(cl::Buffer *hist_buf, int bits, int n);
-
-  /**
-   * @brief Store the pass value and the input key buffer in this kernel
-   * @param[in] pass Pass of the radix sort algorithm
-   * @param[in] in_key_buf Buffer of input key
-   * @return N/A
-   */
-  void setPass(int pass, cl::Buffer *in_key_buf)
-  {
-    _pass = pass;
-    _in_key_buf = in_key_buf;
-  }
-
-  /**
-   * @brief Run CLRadixSortHistogram op (overrides the inherited run())
-   * @param[in] window  Window to be used for in_slice
-   * @param[in] queue   cl::CommandQueue
-   * @return N/A
-   */
-  void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-  int _pass; /**< Pass value stored by setPass() */
-  cl::Buffer *_in_key_buf; /**< Input key buffer stored by setPass() */
-};
-
-/**
- * @brief Class to define the CLRadixSortScanHistogram kernel (an ICLKernel)
- */
-class CLRadixSortScanHistogram : public ICLKernel
-{
-public:
-  /**
-   * @brief Constructor
-   */
-  CLRadixSortScanHistogram();
-  /**
-   * @brief Prevent instances of this class from being copy-constructed.
-   * @param [in] copiedInstance Const reference of CLRadixSortScanHistogram to be copied
-   */
-  CLRadixSortScanHistogram(const CLRadixSortScanHistogram &) = delete;
-  /**
-   * @brief Prevent instances of this class from being copy-assigned.
-   * @param [in] copiedInstance Const reference of CLRadixSortScanHistogram to be copied
-   * @return Reference of this instance
-   */
-  CLRadixSortScanHistogram &operator=(const CLRadixSortScanHistogram &) = delete;
-  /**
-   * @brief Allow instances of this class to be move-constructed
-   * @param [in] movedInstance Rvalue reference of CLRadixSortScanHistogram to be moved
-   */
-  CLRadixSortScanHistogram(CLRadixSortScanHistogram &&) = default;
-  /**
-   * @brief Allow instances of this class to be move-assigned
-   * @param [in] movedInstance Rvalue reference of CLRadixSortScanHistogram to be moved
-   * @return Reference of this instance
-   */
-  CLRadixSortScanHistogram &operator=(CLRadixSortScanHistogram &&) = default;
-
-  /**
-   * @brief Initialise kernel with params
-   * @param[out] hist_buf Buffer of histogram
-   * @param[out] glob_sum_buf Buffer of global sum
-   * @param[in] bits Number of bits to be used for radix sort
-   * @return N/A
-   */
-  void configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits);
-
-  /**
-   * @brief Run CLRadixSortScanHistogram op (overrides the inherited run())
-   * @param[in] window  Window to be used for in_slice
-   * @param[in] queue   cl::CommandQueue
-   * @return N/A
-   */
-  void run(const Window &window, cl::CommandQueue &queue) override;
-};
-
-/**
- * @brief Class to define the CLRadixSortGlobalScanHistogram kernel (an ICLKernel)
- */
-class CLRadixSortGlobalScanHistogram : public ICLKernel
-{
-public:
-  /**
-   * @brief Constructor
-   */
-  CLRadixSortGlobalScanHistogram();
-  /**
-   * @brief Prevent instances of this class from being copy-constructed.
-   * @param [in] copiedInstance Const reference of CLRadixSortGlobalScanHistogram to be copied
-   */
-  CLRadixSortGlobalScanHistogram(const CLRadixSortGlobalScanHistogram &) = delete;
-  /**
-   * @brief Prevent instances of this class from being copy-assigned.
-   * @param [in] copiedInstance Const reference of CLRadixSortGlobalScanHistogram to be copied
-   * @return Reference of this instance
-   */
-  CLRadixSortGlobalScanHistogram &operator=(const CLRadixSortGlobalScanHistogram &) = delete;
-  /**
-   * @brief Allow instances of this class to be move-constructed
-   * @param [in] movedInstance Rvalue reference of CLRadixSortGlobalScanHistogram to be moved
-   */
-  CLRadixSortGlobalScanHistogram(CLRadixSortGlobalScanHistogram &&) = default;
-  /**
-   * @brief Allow instances of this class to be move-assigned
-   * @param [in] movedInstance Rvalue reference of CLRadixSortGlobalScanHistogram to be moved
-   * @return Reference of this instance
-   */
-  CLRadixSortGlobalScanHistogram &operator=(CLRadixSortGlobalScanHistogram &&) = default;
-
-  /**
-   * @brief Initialise kernel with params
-   * @param[out] glob_sum_buf Buffer of global sum
-   * @param[out] temp_buf Temp buffer to be used while RadixSortGlobalScanHistogram
-   * @param[in] bits Number of bits to be used for radix sort
-   * @return N/A
-   */
-  void configure(cl::Buffer *glob_sum_buf, cl::Buffer *temp_buf, int bits);
-
-  /**
-   * @brief Run CLRadixSortGlobalScanHistogram op (overrides the inherited run())
-   * @param[in] window  Window to be used for in_slice
-   * @param[in] queue   cl::CommandQueue
-   * @return N/A
-   */
-  void run(const Window &window, cl::CommandQueue &queue) override;
-};
-
-/**
- * @brief Class to define the CLRadixSortPasteHistogram kernel (an ICLKernel)
- */
-class CLRadixSortPasteHistogram : public ICLKernel
-{
-public:
-  /**
-   * @brief Constructor
-   */
-  CLRadixSortPasteHistogram();
-  /**
-   * @brief Prevent instances of this class from being copy-constructed.
-   * @param [in] copiedInstance Const reference of CLRadixSortPasteHistogram to be copied
-   */
-  CLRadixSortPasteHistogram(const CLRadixSortPasteHistogram &) = delete;
-  /**
-   * @brief Prevent instances of this class from being copy-assigned.
-   * @param [in] copiedInstance Const reference of CLRadixSortPasteHistogram to be copied
-   * @return Reference of this instance
-   */
-  CLRadixSortPasteHistogram &operator=(const CLRadixSortPasteHistogram &) = delete;
-  /**
-   * @brief Allow instances of this class to be move-constructed
-   * @param [in] movedInstance Rvalue reference of CLRadixSortPasteHistogram to be moved
-   */
-  CLRadixSortPasteHistogram(CLRadixSortPasteHistogram &&) = default;
-  /**
-   * @brief Allow instances of this class to be move-assigned
-   * @param [in] movedInstance Rvalue reference of CLRadixSortPasteHistogram to be moved
-   * @return Reference of this instance
-   */
-  CLRadixSortPasteHistogram &operator=(CLRadixSortPasteHistogram &&) = default;
-
-  /**
-   * @brief Initialise kernel with params
-   * @param[out] hist_buf Buffer of histogram
-   * @param[out] glob_sum_buf Buffer of global sum
-   * @param[in] bits Number of bits to be used for radix sort
-   * @return N/A
-   */
-  void configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits);
-
-  /**
-   * @brief Run CLRadixSortPasteHistogram op (overrides the inherited run())
-   * @param[in] window  Window to be used for in_slice
-   * @param[in] queue   cl::CommandQueue
-   * @return N/A
-   */
-  void run(const Window &window, cl::CommandQueue &queue) override;
-};
-
-/**
- * @brief Class to define the CLRadixSortReorder kernel (an ICLKernel)
- */
-class CLRadixSortReorder : public ICLKernel
-{
-public:
-  /**
-   * @brief Constructor
-   */
-  CLRadixSortReorder();
-  /**
-   * @brief Prevent instances of this class from being copy-constructed (it contains pointers).
-   * @param [in] copiedInstance Const reference of CLRadixSortReorder to be copied
-   */
-  CLRadixSortReorder(const CLRadixSortReorder &) = delete;
-  /**
-   * @brief Prevent instances of this class from being copy-assigned (it contains pointers).
-   * @param [in] copiedInstance Const reference of CLRadixSortReorder to be copied
-   * @return Reference of this instance
-   */
-  CLRadixSortReorder &operator=(const CLRadixSortReorder &) = delete;
-  /**
-   * @brief Allow instances of this class to be move-constructed
-   * @param [in] movedInstance Rvalue reference of CLRadixSortReorder to be moved
-   */
-  CLRadixSortReorder(CLRadixSortReorder &&) = default;
-  /**
-   * @brief Allow instances of this class to be move-assigned
-   * @param [in] movedInstance Rvalue reference of CLRadixSortReorder to be moved
-   * @return Reference of this instance
-   */
-  CLRadixSortReorder &operator=(CLRadixSortReorder &&) = default;
-
-  /**
-   * @brief Initialise kernel with params
-   * @param[out] hist_buf Buffer of histogram
-   * @param[in] bits Number of bits to be used for radix sort
-   * @param[in] n Integer number size to sort
-   * @return N/A
-   */
-  void configure(cl::Buffer *hist_buf, int bits, int n);
-
-  /**
-   * @brief Store the pass value and the key/index buffers in this kernel
-   * @param[in] pass Pass of the radix sort algorithm
-   * @param[in] in_key_buf Buffer of input key
-   * @param[out] out_key_buf Buffer of output key
-   * @param[in] in_ind_buf Buffer of input index
-   * @param[out] out_ind_buf Buffer of output index
-   * @return N/A
-   */
-  void setPass(int pass, cl::Buffer *in_key_buf, cl::Buffer *out_key_buf, cl::Buffer *in_ind_buf,
-               cl::Buffer *out_ind_buf)
-  {
-    _pass = pass;
-    _in_key_buf = in_key_buf;
-    _out_key_buf = out_key_buf;
-    _in_ind_buf = in_ind_buf;
-    _out_ind_buf = out_ind_buf;
-  }
-  /**
-   * @brief Run CLRadixSortReorder op (overrides the inherited run())
-   * @param[in] window  Window to be used for in_slice
-   * @param[in] queue   cl::CommandQueue
-   * @return N/A
-   */
-  void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-  int _pass; /**< Pass value stored by setPass() */
-  cl::Buffer *_in_key_buf; /**< Input key buffer stored by setPass() */
-  cl::Buffer *_out_key_buf; /**< Output key buffer stored by setPass() */
-  cl::Buffer *_in_ind_buf; /**< Input index buffer stored by setPass() */
-  cl::Buffer *_out_ind_buf; /**< Output index buffer stored by setPass() */
-};
-
-/**
- * @brief Class to define the CLTopKV2FindFirstNegative kernel (an ICLKernel)
- */
-class CLTopKV2FindFirstNegative : public ICLKernel
-{
-public:
-  /**
-   * @brief Constructor
-   */
-  CLTopKV2FindFirstNegative();
-  /**
-   * @brief Prevent instances of this class from being copy-constructed (it contains pointers).
-   * @param [in] copiedInstance Const reference of CLTopKV2FindFirstNegative to be copied
-   */
-  CLTopKV2FindFirstNegative(const CLTopKV2FindFirstNegative &) = delete;
-  /**
-   * @brief Prevent instances of this class from being copy-assigned (it contains pointers).
-   * @param [in] copiedInstance Const reference of CLTopKV2FindFirstNegative to be copied
-   * @return Reference of this instance
-   */
-  CLTopKV2FindFirstNegative &operator=(const CLTopKV2FindFirstNegative &) = delete;
-  /**
-   * @brief Allow instances of this class to be move-constructed
-   * @param [in] movedInstance Rvalue reference of CLTopKV2FindFirstNegative to be moved
-   */
-  CLTopKV2FindFirstNegative(CLTopKV2FindFirstNegative &&) = default;
-  /**
-   * @brief Allow instances of this class to be move-assigned
-   * @param [in] movedInstance Rvalue reference of CLTopKV2FindFirstNegative to be moved
-   * @return Reference of this instance
-   */
-  CLTopKV2FindFirstNegative &operator=(CLTopKV2FindFirstNegative &&) = default;
-
-  /**
-   * @brief Initialise kernel with params
-   * @param[out] first_negative_idx_buf Buffer of the first negative index
-   * @param[in] n Number of times to find
-   * @return N/A
-   */
-  void configure(cl::Buffer *first_negative_idx_buf, int n);
-
-  /**
-   * @brief Store the output key buffer pointer in this kernel
-   * @param[out] out_key_buf Buffer of output key
-   * @return N/A
-   */
-  void setOutputBuffer(cl::Buffer *out_key_buf) { _out_key_buf = out_key_buf; }
-
-  /**
-   * @brief Run CLTopKV2FindFirstNegative op (overrides the inherited run())
-   * @param[in] window  Window to be used for in_slice
-   * @param[in] queue   cl::CommandQueue
-   * @return N/A
-   */
-  void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-  cl::Buffer *_out_key_buf; /**< Output key buffer stored by setOutputBuffer() */
-};
-
-/**
- * @brief Class to define CLTopKV2ReorderNegatives (OpenCL kernel derived from ICLKernel)
- * @note Judging by its name and setBuffers() parameters, it presumably reorders negative keys
- *       (and their indices) from the input buffers into the output buffers - TODO confirm
- */
-class CLTopKV2ReorderNegatives : public ICLKernel
-{
-public:
-  /**
-   * @brief Constructor
-   */
-  CLTopKV2ReorderNegatives();
-  /**
-   * @brief Prevent instances of this class from being copied (As this class contains pointers).
-   * @param [in] copiedInstance Const reference of CLTopKV2ReorderNegatives to be copied
-   */
-  CLTopKV2ReorderNegatives(const CLTopKV2ReorderNegatives &) = delete;
-  /**
-   * @brief Prevent instances of this class from being copied (As this class contains pointers).
-   * @param [in] copiedInstance Const reference of CLTopKV2ReorderNegatives to be copied
-   * @return Reference of this instance
-   */
-  CLTopKV2ReorderNegatives &operator=(const CLTopKV2ReorderNegatives &) = delete;
-  /**
-   * @brief Allow instances of this class to be moved
-   * @param [in] movedInstance Rvalue reference of CLTopKV2ReorderNegatives to be moved
-   */
-  CLTopKV2ReorderNegatives(CLTopKV2ReorderNegatives &&) = default;
-  /**
-   * @brief Allow instances of this class to be moved
-   * @param [in] movedInstance Rvalue reference of CLTopKV2ReorderNegatives to be moved
-   * @return Reference of this instance
-   */
-  CLTopKV2ReorderNegatives &operator=(CLTopKV2ReorderNegatives &&) = default;
-
-  /**
-   * @brief Initialise kernel with params
-   * @param[out] first_negative_idx_buf Buffer of the first negative index
-   * @param[in] n Number of times to find
-   * @return N/A
-   */
-  void configure(cl::Buffer *first_negative_idx_buf, int n);
-
-  /**
-   * @brief Set buffers
-   * @param[in] in_key_buf Buffer of input key
-   * @param[out] out_key_buf Buffer of output key
-   * @param[in] in_ind_buf Buffer of input index
-   * @param[out] out_ind_buf Buffer of output index
-   * @return N/A
-   */
-  void setBuffers(cl::Buffer *in_key_buf, cl::Buffer *out_key_buf, cl::Buffer *in_ind_buf,
-                  cl::Buffer *out_ind_buf)
-  {
-    _in_key_buf = in_key_buf;
-    _out_key_buf = out_key_buf;
-    _in_ind_buf = in_ind_buf;
-    _out_ind_buf = out_ind_buf;
-  }
-
-  /**
-   * @brief Run CLTopKV2ReorderNegatives op
-   * @param[in] window  Window to be used for in_slice
-   * @param[in] queue   cl::CommandQueue
-   * @return N/A
-   */
-  void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-  cl::Buffer *_in_key_buf;  ///< Input key buffer pointer stored by setBuffers()
-  cl::Buffer *_out_key_buf; ///< Output key buffer pointer stored by setBuffers()
-  cl::Buffer *_in_ind_buf;  ///< Input index buffer pointer stored by setBuffers()
-  cl::Buffer *_out_ind_buf; ///< Output index buffer pointer stored by setBuffers()
-};
-
-/**
- * @brief Class to define CLTopKV2Store (OpenCL kernel derived from ICLKernel)
- * @note Judging by its name and configure() parameters, it presumably writes the top-k keys
- *       and indices into the values/indices tensors - TODO confirm against the implementation
- */
-class CLTopKV2Store : public ICLKernel
-{
-public:
-  /**
-   * @brief Constructor
-   */
-  CLTopKV2Store();
-  /**
-   * @brief Prevent instances of this class from being copied (As this class contains pointers).
-   * @param [in] copiedInstance Const reference of CLTopKV2Store to be copied
-   */
-  CLTopKV2Store(const CLTopKV2Store &) = delete;
-  /**
-   * @brief Prevent instances of this class from being copied (As this class contains pointers).
-   * @param [in] copiedInstance Const reference of CLTopKV2Store to be copied
-   * @return Reference of this instance
-   */
-  CLTopKV2Store &operator=(const CLTopKV2Store &) = delete;
-  /**
-   * @brief Allow instances of this class to be moved
-   * @param [in] movedInstance Rvalue reference of CLTopKV2Store to be moved
-   */
-  CLTopKV2Store(CLTopKV2Store &&) = default;
-  /**
-   * @brief Allow instances of this class to be moved
-   * @param [in] movedInstance Rvalue reference of CLTopKV2Store to be moved
-   * @return Reference of this instance
-   */
-  CLTopKV2Store &operator=(CLTopKV2Store &&) = default;
-
-  /**
-   * @brief Initialise kernel with params
-   * @param[out] values Values tensor to store
-   * @param[out] indices Indices tensor to be used for store
-   * @param[in] k K of the top k predictions
-   * @param[in] n Number of times to store
-   * @return N/A
-   */
-  void configure(ICLTensor *values, ICLTensor *indices, int k, int n);
-
-  /**
-   * @brief Set output buffers
-   * @param[out] out_key_buf Buffer of output key
-   * @param[out] out_ind_buf Buffer of output index
-   * @return N/A
-   */
-  void setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf);
-
-  /**
-   * @brief Run CLTopKV2Store op
-   * @param[in] window  Window to be used for in_slice
-   * @param[in] queue   cl::CommandQueue
-   * @return N/A
-   */
-  void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-  ICLTensor *_values;       ///< Values tensor (see configure())
-  ICLTensor *_indices;      ///< Indices tensor (see configure())
-  cl::Buffer *_out_key_buf; ///< Buffer of output key (see setOutputBuffers())
-  cl::Buffer *_out_ind_buf; ///< Buffer of output index (see setOutputBuffers())
-};
-
-} // namespace arm_compute
-#endif // Disable GPU implementation
-#endif // __ARM_COMPUTE_CLTOPKV2KERNEL_H__
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h b/compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h
deleted file mode 100644
index 933d8760d..000000000
--- a/compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEELEMENTWISEOPERATIONFUNCS_H__
-#define __ARM_COMPUTE_NEELEMENTWISEOPERATIONFUNCS_H__
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-class ITensor;
-class Window;
-class QuantizationInfo;
-} // namespace arm_compute
-
-namespace arm_compute
-{
-/** Element-wise binary operation helper, declared for float and uint8_t element types.
- *
- * Both overloads below share this parameter list; only the element type differs.
- *
- * @param[in]  in1            First input tensor.
- * @param[in]  in2            Second input tensor.
- * @param[out] out            Output tensor.
- * @param[in]  window         Window passed through by the caller (presumably the execution
- *                            region - TODO confirm against the implementation).
- * @param[in]  scalar_func    Callback taking two elements by const reference and returning one.
- * @param[in]  broadcast_func Callback taking three ints, an element pointer, a single element,
- *                            an output pointer and a bool flag, returning int. NOTE(review): the
- *                            ints look like window start/end/step on x and the flag like an
- *                            "operands swapped" marker - verify before relying on it.
- * @param[in]  neon_func      Callback taking three ints, two input pointers and an output
- *                            pointer, returning int (presumably the vectorised path - confirm).
- */
-void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
-                    float (*scalar_func)(const float &, const float &),
-                    int (*broadcast_func)(int, int, int, const float *, const float &, float *,
-                                          const bool),
-                    int (*neon_func)(int, int, int, const float *, const float *, float *));
-
-void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
-                    uint8_t (*scalar_func)(const uint8_t &, const uint8_t &),
-                    int (*broadcast_func)(int, int, int, const uint8_t *, const uint8_t &,
-                                          uint8_t *, const bool),
-                    int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *));
-} // namespace arm_compute
-#endif // __ARM_COMPUTE_NEELEMENTWISEOPERATIONFUNCS_H__
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEActivationLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEActivationLayerKernelEx.h
deleted file mode 100644
index a827f48f8..000000000
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEActivationLayerKernelEx.h
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEACTIVATIONLAYERKERNELEX_H__
-#define __ARM_COMPUTE_NEACTIVATIONLAYERKERNELEX_H__
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#include <arm_fp16.h>
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the activation layer kernel. */
-class NEActivationLayerKernelEx : public INEKernel
-{
-public:
-  const char *name() const override { return "NEActivationLayerKernelEx"; }
-  /** Constructor */
-  NEActivationLayerKernelEx();
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  NEActivationLayerKernelEx(const NEActivationLayerKernelEx &) = delete;
-  /** Default move constructor */
-  NEActivationLayerKernelEx(NEActivationLayerKernelEx &&) = default;
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  NEActivationLayerKernelEx &operator=(const NEActivationLayerKernelEx &) = delete;
-  /** Default move assignment operator */
-  NEActivationLayerKernelEx &operator=(NEActivationLayerKernelEx &&) = default;
-  /** Set the input and output tensor.
-   *
-   * @note If the output tensor is a nullptr, the activation function will be performed in-place
-   *
-   * @param[in, out] input           Source tensor. In case of @p output tensor = nullptr, this
-   *                                 tensor will store the result of the activation function.
-   *                                 Data types supported: QASYMM8/QSYMM16/F16/F32.
-   * @param[out]     output          Destination tensor. Data type supported: same as @p input
-   * @param[in]      activation_info Activation layer information.
-   */
-  void configure(ITensor *input, ITensor *output, ActivationLayerInfo activation_info);
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * NEActivationLayerKernelEx
-   *
-   * @param[in] input    Source tensor info. In case of @p output tensor info = nullptr, this
-   *                     tensor will store the result of the activation function.
-   *                     Data types supported: QASYMM8/QSYMM16/F16/F32.
-   * @param[in] output   Destination tensor info. Data type supported: same as @p input
-   * @param[in] act_info Activation layer information.
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *output,
-                         const ActivationLayerInfo &act_info);
-
-  // Inherited methods overridden:
-  void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-  using ActivationFunction = ActivationLayerInfo::ActivationFunction;
-  /** Common signature for all the specialised @ref NEActivationLayerKernelEx functions
-   *
-   * @param[in] window Region on which to execute the kernel.
-   */
-  using ActivationFunctionExecutorPtr = void (NEActivationLayerKernelEx::*)(const Window &window);
-  /** Function to apply an activation function on a tensor.
-   *
-   * This overload is enabled only when @p T is a floating-point type.
-   *
-   * @tparam F Activation function to apply
-   * @tparam T Element type
-   *
-   * @param[in] window Region on which to execute the kernel
-   */
-  template <ActivationLayerInfo::ActivationFunction F, typename T>
-  typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type
-  activation(const Window &window);
-  /** Function to apply an activation function on a tensor.
-   *
-   * This overload is enabled only when @p T is qasymm8_t.
-   *
-   * @tparam F Activation function to apply
-   * @tparam T Element type
-   *
-   * @param[in] window Region on which to execute the kernel
-   */
-  template <ActivationLayerInfo::ActivationFunction F, typename T>
-  typename std::enable_if<std::is_same<T, qasymm8_t>::value, void>::type
-  activation(const Window &window);
-  /** Function to apply an activation function on a tensor.
-   *
-   * This overload is enabled only when @p T is qsymm16_t.
-   *
-   * @tparam F Activation function to apply
-   * @tparam T Element type
-   *
-   * @param[in] window Region on which to execute the kernel
-   */
-  template <ActivationLayerInfo::ActivationFunction F, typename T>
-  typename std::enable_if<std::is_same<T, qsymm16_t>::value, void>::type
-  activation(const Window &window);
-
-private:
-  ITensor *_input;                     ///< Source tensor (see configure())
-  ITensor *_output;                    ///< Destination tensor (see configure())
-  ActivationFunctionExecutorPtr _func; ///< Member-function pointer with the activation() signature
-  ActivationLayerInfo _act_info;       ///< Activation layer information
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_NEACTIVATIONLAYERKERNELEX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h
deleted file mode 100644
index 8c544cda8..000000000
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__
-#define __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__
-
-#include "arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h"
-#include "arm_compute/core/TypesEx.h"
-
-namespace arm_compute
-{
-/** Kernel performing a binary logical operation; derives from NEElementwiseOperationKernel */
-class NEBinaryLogicalOperationKernel : public NEElementwiseOperationKernel
-{
-public:
-  /** Default destructor */
-  ~NEBinaryLogicalOperationKernel() = default;
-
-  /** Initialise the kernel's inputs and output
-   *
-   * @param[in]  op     Binary logical operation to be executed.
-   * @param[in]  input1 First tensor input. Data types supported: QASYMM8/U8.
-   * @param[in]  input2 Second tensor input. Data types supported: Same as @p input1.
-   * @param[out] output Output tensor. Data types supported: Same as @p input1.
-   */
-  void configure(BinaryLogicalOperation op, const ITensor *input1, const ITensor *input2,
-                 ITensor *output);
-
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * NEBinaryLogicalOperationKernel
-   *
-   * @param[in] op     Binary logical operation to be executed.
-   * @param[in] input1 First tensor input info. Data types supported: QASYMM8/U8.
-   * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
-   * @param[in] output Output tensor info. Data types supported: Same as @p input1.
-   *
-   * @return a Status
-   */
-  static Status validate(BinaryLogicalOperation op, const ITensorInfo *input1,
-                         const ITensorInfo *input2, const ITensorInfo *output);
-
-protected:
-  // Inherited methods overridden:
-  static Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2,
-                                   const ITensorInfo &output);
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h
deleted file mode 100644
index 101f6ac8e..000000000
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_NECASTBOOLKERNEL_H__
-#define __ARM_COMPUTE_NECASTBOOLKERNEL_H__
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/**
- * @brief Class for the kernel converting boolean type (carried as U8) to other data types
- */
-class NECastBoolKernel : public INEKernel
-{
-public:
-  const char *name() const override { return "NECastBoolKernel"; }
-  /** Default constructor */
-  NECastBoolKernel();
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  NECastBoolKernel(const NECastBoolKernel &) = delete;
-  /** Default move constructor */
-  NECastBoolKernel(NECastBoolKernel &&) = default;
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  NECastBoolKernel &operator=(const NECastBoolKernel &) = delete;
-  /** Default move assignment operator */
-  NECastBoolKernel &operator=(NECastBoolKernel &&) = default;
-  /** Set the input and output of the kernel
-   *
-   * Valid conversions Input -> Output :
-   *
-   *   - U8             -> U8, S8, U16, S16, U32, S32, F32, F16
-   *
-   * @param[in]  input  The input tensor to convert. Data types supported: U8
-   * @param[out] output The output tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
-   */
-  void configure(const ITensor *input, ITensor *output);
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * NECastBoolKernel
-   *
-   * @param[in] input  Source tensor info. Data types supported: U8
-   * @param[in] output Destination tensor info. Data type supported: U8/S8/U16/S16/U32/S32/F16/F32.
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
-  // Inherited methods overridden:
-  void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-  const ITensor *_input; ///< Input tensor to convert (see configure())
-  ITensor *_output;      ///< Output tensor (see configure())
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_NECASTBOOLKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h
deleted file mode 100644
index 88f21c96e..000000000
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEEMBEDDINGLOOKUPKERNEL_H__
-#define __ARM_COMPUTE_NEEMBEDDINGLOOKUPKERNEL_H__
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to perform EmbeddingLookup operation */
-class NEEmbeddingLookupKernel : public INEKernel
-{
-public:
-  const char *name() const override { return "NEEmbeddingLookupKernel"; }
-  /** Default constructor */
-  NEEmbeddingLookupKernel();
-  /** Prevent instances of this class from being copied (As this class contains pointers). */
-  NEEmbeddingLookupKernel(const NEEmbeddingLookupKernel &) = delete;
-  /** Prevent instances of this class from being copied (As this class contains pointers). */
-  NEEmbeddingLookupKernel &operator=(const NEEmbeddingLookupKernel &) = delete;
-  /** Allow instances of this class to be moved */
-  NEEmbeddingLookupKernel(NEEmbeddingLookupKernel &&) = default;
-  /** Allow instances of this class to be moved */
-  NEEmbeddingLookupKernel &operator=(NEEmbeddingLookupKernel &&) = default;
-  /** Initialize the kernel's input, output.
-   *
-   * @param[in]  input   Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
-   * @param[out] output  Destination tensor. Data types supported: same as @p input.
-   * @param[in]  lookups 1D tensor whose values are indices into the first dimension of
-   *                     @p input.
-   */
-  void configure(const ITensor *input, ITensor *output, const ITensor *lookups);
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * NEEmbeddingLookupKernel
-   *
-   * @param[in] input   Source tensor info. Data types supported:
-   *                    U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
-   * @param[in] output  Destination tensor info. Data types supported: same as @p input.
-   * @param[in] lookups Lookups info. Data types supported: S32.
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *output,
-                         const ITensorInfo *lookups);
-
-  // Inherited methods overridden:
-  void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-  const ITensor *_input;   ///< Source tensor (see configure())
-  const ITensor *_lookups; ///< Lookups tensor (see configure())
-  ITensor *_output;        ///< Destination tensor (see configure())
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_NEEMBEDDINGLOOKUPKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h
deleted file mode 100644
index 5acfde5a8..000000000
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEGATHERKERNELEX_H__
-#define __ARM_COMPUTE_NEGATHERKERNELEX_H__
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Kernel to perform gather operation on NEON */
-class NEGatherKernelEx : public INEKernel
-{
-public:
-  /** Default constructor. */
-  NEGatherKernelEx();
-  /** Prevent instances of this class from being copied (As this class contains pointers). */
-  NEGatherKernelEx(const NEGatherKernelEx &) = delete;
-  /** Prevent instances of this class from being copied (As this class contains pointers). */
-  NEGatherKernelEx &operator=(const NEGatherKernelEx &) = delete;
-  /** Allow instances of this class to be moved. */
-  NEGatherKernelEx(NEGatherKernelEx &&) = default;
-  /** Allow instances of this class to be moved. */
-  NEGatherKernelEx &operator=(NEGatherKernelEx &&) = default;
-  /** Default destructor */
-  ~NEGatherKernelEx() = default;
-
-  /** Name of the kernel
-   *
-   * @return Kernel name
-   */
-  const char *name() const override { return "NEGatherKernelEx"; }
-  /** Initialise the kernel's inputs and outputs
-   *
-   * @param[in]  input   Source tensor. Supported tensor rank: up to 4. Data type supported:
-   *                     U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
-   * @param[in]  indices Indices tensor. Supported tensor rank: up to 3. Must be one of the
-   *                     following type: U32/S32. Each value must be in range
-   *                     [0, input.shape[@p axis])
-   * @param[out] output  Destination tensor. Data type supported: Same as @p input
-   * @param[in]  axis    (Optional) The axis in @p input to gather @p indices from. Negative values
-   *                     wrap around. Defaults to 0
-   */
-  void configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis = 0);
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * NEGatherKernelEx
-   *
-   * @param[in] input   Source tensor info. Supported tensor rank: up to 4. Data type supported:
-   *                    U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
-   * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. Must be one of the
-   *                    following type: U32/S32. Each value must be in range
-   *                    [0, input.shape[@p axis])
-   * @param[in] output  Destination tensor info. Data type supported: Same as @p input
-   * @param[in] axis    The axis in @p input to gather @p indices from. Negative values wrap
-   *                    around. Unlike configure(), this parameter has no default value.
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *indices,
-                         const ITensorInfo *output, int axis);
-
-  // Inherited methods overridden:
-  void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-  /** Implementation of the gather operation for 0 axis.
-   *
-   * For gather on the 0 axis an element by element copy is performed.
-   *
-   * @param[in] window Region on which to execute the kernel. (Must be a region of the window
-   * returned by window())
-   * @param[in] info   Info about executing thread and CPU.
-   */
-  template <typename U> void gather_0_axis(const Window &window, const ThreadInfo &info);
-
-  /** Implementation of the gather operation.
-   *
-   * For 1<=axis a row-wise copy is taking place.
-   *
-   * @param[in] window Region on which to execute the kernel. (Must be a region of the window
-   * returned by window())
-   * @param[in] info   Info about executing thread and CPU.
-   */
-  template <typename U> void gather_n_axis(const Window &window, const ThreadInfo &info);
-
-  using kernel_ptr = void (NEGatherKernelEx::*)(const Window &window, const ThreadInfo &info);
-
-  const ITensor *_input;   ///< Source tensor (see configure())
-  const ITensor *_indices; ///< Indices tensor (see configure())
-  int _axis;               ///< Gather axis (see configure())
-  size_t _indices_rank;    ///< Rank of the indices tensor - presumably cached by configure()
-  ITensor *_output;        ///< Destination tensor (see configure())
-  kernel_ptr _func;        ///< Member-function pointer with the gather_*_axis signature
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NEGATHERKERNELEX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h
deleted file mode 100644
index cb2a485d5..000000000
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEHASHTABLELOOKUPKERNEL_H__
-#define __ARM_COMPUTE_NEHASHTABLELOOKUPKERNEL_H__
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to perform HashtableLookup operation */
-class NEHashtableLookupKernel : public INEKernel
-{
-public:
-  const char *name() const override { return "NEHashtableLookupKernel"; }
-  /** Default constructor */
-  NEHashtableLookupKernel();
-  /** Prevent instances of this class from being copied (As this class contains pointers). */
-  NEHashtableLookupKernel(const NEHashtableLookupKernel &) = delete;
-  /** Prevent instances of this class from being copied (As this class contains pointers). */
-  NEHashtableLookupKernel &operator=(const NEHashtableLookupKernel &) = delete;
-  /** Allow instances of this class to be moved */
-  NEHashtableLookupKernel(NEHashtableLookupKernel &&) = default;
-  /** Allow instances of this class to be moved */
-  NEHashtableLookupKernel &operator=(NEHashtableLookupKernel &&) = default;
-  /** Initialize the kernel's inputs, outputs.
-   *
-   * @param[in]  lookups  Lookups 1D tensor whose values are indices into the first dimension
-   *                      of input. Data types supported: S32
-   * @param[in]  keys     Keys 1D tensor. keys and input pair represent a map.
-   *                      Data types supported: S32
-   * @param[in]  input    Source tensor.
-   *                      Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
-   * @param[out] output   Destination tensor. Data types and data layouts supported: Same as
-   *                      @p input.
-   * @param[out] hits     Hits 1D tensor. A boolean tensor that indicates whether the lookup
-   *                      hits (True) or not (False).
-   *                      Data types supported: U8/QASYMM8
-   */
-  void configure(const ITensor *lookups, const ITensor *keys, const ITensor *input, ITensor *output,
-                 ITensor *hits);
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * NEHashtableLookupKernel
-   *
-   * @param[in]  lookups  The lookups tensor info. Data types supported: S32.
-   * @param[in]  keys     The keys tensor info. keys and input pair represent a map.
-   *                      Data types supported: S32
-   * @param[in]  input    The input tensor info.
-   *                      Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
-   * @param[in]  output   The output tensor info. Data types and data layouts supported: Same as
-   *                      @p input.
-   * @param[in]  hits     The hits tensor info. A boolean tensor that indicates whether the lookup
-   *                      hits (True) or not (False). Data types supported: U8/QASYMM8
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *lookups, const ITensorInfo *keys,
-                         const ITensorInfo *input, const ITensorInfo *output,
-                         const ITensorInfo *hits);
-
-  // Inherited methods overridden:
-  void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-  const ITensor *_lookups; /**< Lookups tensor */
-  const ITensor *_keys;    /**< Keys tensor */
-  const ITensor *_input;   /**< Source tensor */
-  ITensor *_output;        /**< Destination tensor */
-  ITensor *_hits;          /**< Hits tensor */
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_NEHASHTABLELOOKUPKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h
deleted file mode 100644
index 8724cc69b..000000000
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNELEX_H__
-#define __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNELEX_H__
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for performing an instance normalization */
-class NEInstanceNormalizationLayerKernelEx : public INEKernel
-{
-public:
-  const char *name() const override { return "NEInstanceNormalizationLayerKernelEx"; }
-  /** Default constructor */
-  NEInstanceNormalizationLayerKernelEx();
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  NEInstanceNormalizationLayerKernelEx(const NEInstanceNormalizationLayerKernelEx &) = delete;
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  NEInstanceNormalizationLayerKernelEx &
-  operator=(const NEInstanceNormalizationLayerKernelEx &) = delete;
-  /** Allow instances of this class to be moved */
-  NEInstanceNormalizationLayerKernelEx(NEInstanceNormalizationLayerKernelEx &&) = default;
-  /** Allow instances of this class to be moved */
-  NEInstanceNormalizationLayerKernelEx &
-  operator=(NEInstanceNormalizationLayerKernelEx &&) = default;
-  /** Default destructor */
-  ~NEInstanceNormalizationLayerKernelEx() = default;
-  /** Set the input and output tensors.
-   *
-   * @param[in, out] input   Source tensor. Data types supported: F16/F32. Data layout supported:
-   *                         NCHW.
-   *                         In case of @p output tensor = nullptr this tensor will store the result
-   *                         of the normalization.
-   * @param[out]     output  Destination tensor. Data types and data layouts supported: same as
-   *                         @p input.
-   * @param[in]      gamma   (Optional) The scale tensor applied to the normalized tensor. Defaults
-   *                         to nullptr (presumably meaning a scale of 1.0 - TODO confirm)
-   * @param[in]      beta    (Optional) The offset tensor applied to the normalized tensor. Defaults
-   *                         to nullptr (presumably meaning an offset of 0.0 - TODO confirm)
-   * @param[in]      epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
-   */
-  void configure(ITensor *input, ITensor *output, ITensor *gamma = nullptr, ITensor *beta = nullptr,
-                 float epsilon = 1e-12f);
-
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * NEInstanceNormalizationLayerKernelEx.
-   *
-   * @param[in] input   Source tensor info. Data types supported: F16/F32. Data layout supported:
-   *                    NCHW
-   * @param[in] output  Destination tensor info. Data types and data layouts supported: same as
-   *                    @p input.
-   * @param[in] gamma   (Optional) The scale tensor info. Defaults to nullptr (presumably meaning
-   *                    a scale of 1.0 - TODO confirm)
-   * @param[in] beta    (Optional) The offset tensor info. Defaults to nullptr (presumably meaning
-   *                    an offset of 0.0 - TODO confirm)
-   * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *output,
-                         const ITensorInfo *gamma = nullptr, const ITensorInfo *beta = nullptr,
-                         float epsilon = 1e-12f);
-
-  // Inherited methods overridden:
-  void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-  /** Common signature for all the specialized instance normalization functions
-   *
-   * @param[in, out] input   An input tensor. In case of @p output tensor = nullptr this tensor will
-   *                         store the result of the normalization.
-   * @param[out]     output  The output tensor.
-   * @param[in]      gamma   The scale tensor applied to the normalized tensor.
-   * @param[in]      beta    The offset tensor applied to the normalized tensor.
-   * @param[in]      epsilon Lower bound value for the normalization.
-   * @param[in]      window  Region handed to the specialized function (presumably the window
-   *                         received by run() - TODO confirm in the implementation).
-   */
-  using NormalizationFunction = void(ITensor *input, ITensor *output, ITensor *gamma, ITensor *beta,
-                                     float epsilon, const Window &window);
-
-  NormalizationFunction *_func;
-  ITensor *_input;
-  ITensor *_output;
-  ITensor *_gamma;
-  ITensor *_beta;
-  float _epsilon;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNELEX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h
deleted file mode 100644
index 198b0be9d..000000000
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEMULTIPLYSCALEFACTORKERNEL_H__
-#define __ARM_COMPUTE_NEMULTIPLYSCALEFACTORKERNEL_H__
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface to multiply scale factor kernel. */
-class NEMultiplyScaleFactorKernel : public INEKernel
-{
-public:
-  const char *name() const override { return "NEMultiplyScaleFactorKernel"; }
-  /** Default constructor */
-  NEMultiplyScaleFactorKernel();
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  NEMultiplyScaleFactorKernel(const NEMultiplyScaleFactorKernel &) = delete;
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  NEMultiplyScaleFactorKernel &operator=(const NEMultiplyScaleFactorKernel &) = delete;
-  /** Default Move Constructor. */
-  NEMultiplyScaleFactorKernel(NEMultiplyScaleFactorKernel &&) = default;
-  /** Default move assignment operator */
-  NEMultiplyScaleFactorKernel &operator=(NEMultiplyScaleFactorKernel &&) = default;
-  /** Default destructor */
-  ~NEMultiplyScaleFactorKernel() = default;
-  /** Set input, output tensors.
-   * @param[in]  input        Source tensor. Data type supported: S32.
-   * @param[in]  scale_factor Scale tensor. Data type supported: F16/F32.
-   * @param[out] output       Destination tensor. Data type supported: Same as @p scale_factor.
-   * @param[in]  multiplier   (Optional) Scalar multiplier. Defaults to 1.f
-   */
-  void configure(const ITensor *input, const ITensor *scale_factor, ITensor *output,
-                 float multiplier = 1.f);
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * NEMultiplyScaleFactorKernel
-   * @param[in] input        Input tensor info. Data types supported: S32.
-   * @param[in] scale_factor Scale tensor info. Data type supported: F16/F32.
-   * @param[in] output       Output tensor info. Data types supported: Same as @p scale_factor.
-   * @param[in] multiplier   (Optional) Scalar multiplier. Defaults to 1.f
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *scale_factor,
-                         const ITensorInfo *output, float multiplier = 1.f);
-
-  // Inherited methods overridden:
-  void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-  template <typename T> void multiply(const Window &window);
-
-private:
-  const ITensor *_input;
-  const ITensor *_scale_factor;
-  ITensor *_output;
-  float _multiplier;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_NEMULTIPLYSCALEFACTORKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h
deleted file mode 100644
index 99bb351bc..000000000
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_NEONEHOTKERNEL_H__
-#define __ARM_COMPUTE_NEONEHOTKERNEL_H__
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-/** Kernel to perform the one-hot operation on NEON */
-class NEOneHotKernel : public INEKernel
-{
-public:
-  /** Default constructor. */
-  NEOneHotKernel();
-  /** Prevent instances of this class from being copied (As this class contains pointers). */
-  NEOneHotKernel(const NEOneHotKernel &) = delete;
-  /** Prevent instances of this class from being copied (As this class contains pointers). */
-  NEOneHotKernel &operator=(const NEOneHotKernel &) = delete;
-  /** Allow instances of this class to be moved. */
-  NEOneHotKernel(NEOneHotKernel &&) = default;
-  /** Allow instances of this class to be moved. */
-  NEOneHotKernel &operator=(NEOneHotKernel &&) = default;
-  /** Default destructor */
-  ~NEOneHotKernel() = default;
-  /** Name of the kernel
-   *
-   * @return Kernel name
-   */
-  const char *name() const override { return "NEOneHotKernel"; }
-  /** Initialise the kernel's inputs and outputs
-   *
-   * @param[in]  indices   Indices tensor. Supported tensor rank: up to 3. Must be one of the
-   *                       following types: U32/S32
-   * @param[in]  depth     The tensor for depth of the one hot dimension. Supported tensor rank:
-   *                       up to 3. Must be one of the following types: U32/S32
-   * @param[in]  on_value  On value tensor. Supported tensor rank: only 1. Data type supported:
-   *                       U8/S8/U16/S16/F16/U32/S32/F32
-   * @param[in]  off_value Off value tensor. Supported tensor rank: only 1. Data type supported:
-   *                       Same as @p on_value
-   * @param[out] output    Destination tensor. Data type supported: Same as @p on_value
-   * @param[in]  axis      (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
-   *                       The value must be in range [-indices.rank , indices.rank)
-   */
-  void configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value,
-                 const ITensor *off_value, ITensor *output, int axis = -1);
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * NEOneHotKernel
-   *
-   * @param[in]  indices   Indices tensor info. Supported tensor rank: up to 3. Must be one of the
-   *                       following types: U32/S32
-   * @param[in]  depth     The tensor info for depth of the one hot dimension. Supported tensor
-   *                       rank: up to 3. Must be one of the following types: U32/S32
-   * @param[in]  on_value  On value tensor info. Supported tensor rank: only 1. Data type
-   *                       supported: U8/S8/U16/S16/F16/U32/S32/F32
-   * @param[in]  off_value Off value tensor info. Supported tensor rank: only 1. Data type
-   *                       supported: Same as @p on_value
-   * @param[in]  output    Destination tensor info. Data type supported: Same as @p on_value
-   * @param[in]  axis      (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
-   *                       The value must be in range [-indices.rank , indices.rank)
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *indices, const ITensorInfo *depth,
-                         const ITensorInfo *on_value, const ITensorInfo *off_value,
-                         const ITensorInfo *output, int axis = -1);
-  // Inherited methods overridden:
-  void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-  /** Implementation of the onehot operation for 0 axis.
-   *
-   * For onehot on the 0 axis an element by element copy is performed.
-   *
-   * @param[in] window Region on which to execute the kernel. (Must be a region of the window
-   * returned by window())
-   * @param[in] info   Info about executing thread and CPU.
-   */
-  template <typename U> void onehot_0_axis(const Window &window, const ThreadInfo &info);
-  /** Implementation of the onehot operation.
-   *
-   * For 1<=axis a row-wise copy is taking place.
-   *
-   * @param[in] window Region on which to execute the kernel. (Must be a region of the window
-   * returned by window())
-   * @param[in] info   Info about executing thread and CPU.
-   */
-  template <typename U> void onehot_n_axis(const Window &window, const ThreadInfo &info);
-  using kernel_ptr = void (NEOneHotKernel::*)(const Window &window, const ThreadInfo &info);
-  const ITensor *_indices;
-  const ITensor *_depth;
-  const ITensor *_on_value;
-  const ITensor *_off_value;
-  int _axis;
-  ITensor *_output;
-  kernel_ptr _func;
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NEONEHOTKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h
deleted file mode 100644
index 0b080cf73..000000000
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEQUANTIZATIONSYMMETRICKERNEL_H__
-#define __ARM_COMPUTE_NEQUANTIZATIONSYMMETRICKERNEL_H__
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the symmetric quantization kernel. */
-class NEQuantizationSymmetricKernel : public INEKernel
-{
-public:
-  const char *name() const override { return "NEQuantizationSymmetricKernel"; }
-  /** Default constructor */
-  NEQuantizationSymmetricKernel();
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  NEQuantizationSymmetricKernel(const NEQuantizationSymmetricKernel &) = delete;
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  NEQuantizationSymmetricKernel &operator=(const NEQuantizationSymmetricKernel &) = delete;
-  /** Default Move Constructor. */
-  NEQuantizationSymmetricKernel(NEQuantizationSymmetricKernel &&) = default;
-  /** Default move assignment operator */
-  NEQuantizationSymmetricKernel &operator=(NEQuantizationSymmetricKernel &&) = default;
-  /** Default destructor */
-  ~NEQuantizationSymmetricKernel() = default;
-  /** Set input, output tensors.
-   *
-   * @param[in]  input        Source tensor. Data type supported: F16/F32.
-   * @param[out] output       Destination tensor with the same dimensions of input. Data type
-   *                          supported: S8.
-   * @param[out] scale_factor Scale tensor of @p output. Data type supported: Same as @p input.
-   */
-  void configure(const ITensor *input, ITensor *output, ITensor *scale_factor);
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * NEQuantizationSymmetricKernel
-   *
-   * @param[in] input        Input tensor info. Data types supported: F16/F32.
-   * @param[in] output       Output tensor info. Data types supported: S8.
-   * @param[in] scale_factor Scale tensor info of @p output. Data type supported: Same as @p input.
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *output,
-                         const ITensorInfo *scale_factor);
-
-  // Inherited methods overridden:
-  void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-  template <typename T> void quantize(const Window &window);
-
-private:
-  const ITensor *_input;
-  ITensor *_output;
-  ITensor *_scale_factor;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_NEQUANTIZATIONSYMMETRICKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h
deleted file mode 100644
index c9024fbb3..000000000
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEREDUCTIONOPERATIONKERNELEX_H__
-#define __ARM_COMPUTE_NEREDUCTIONOPERATIONKERNELEX_H__
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/TypesEx.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to perform a reduction operation */
-class NEReductionOperationKernelEx : public INEKernel
-{
-public:
-  const char *name() const override { return "NEReductionOperationKernelEx"; }
-  /** Default constructor */
-  NEReductionOperationKernelEx();
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  NEReductionOperationKernelEx(const NEReductionOperationKernelEx &) = delete;
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  NEReductionOperationKernelEx &operator=(const NEReductionOperationKernelEx &) = delete;
-  /** Allow instances of this class to be moved */
-  NEReductionOperationKernelEx(NEReductionOperationKernelEx &&) = default;
-  /** Allow instances of this class to be moved */
-  NEReductionOperationKernelEx &operator=(NEReductionOperationKernelEx &&) = default;
-  /** Default destructor */
-  ~NEReductionOperationKernelEx() = default;
-
-  /** Set the source, destination of the kernel
-   *
-   * @param[in]  input  Source tensor. Data type supported: QASYMM8/F16/F32. Data layouts supported:
-   *                    NCHW.
-   * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input.
-   *                    Output will have the same number of dimensions as input.
-   * @param[in]  axis   Axis along which to reduce. Supported reduction axis : 0
-   * @param[in]  op     Reduction operation to perform.
-   */
-  void configure(const ITensor *input, ITensor *output, unsigned int axis, ReduceOperation op);
-
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * NEReductionOperationKernelEx.
-   *
-   * @param[in] input  Source tensor info. Data type supported: QASYMM8/F16/F32. Data layouts
-   *                   supported: NCHW.
-   * @param[in] output Destination tensor info. Data types and data layouts supported: same as
-   *                   @p input.
-   *                   Output will have the same number of dimensions as input.
-   * @param[in] axis   Axis along which to reduce. Supported reduction axis : 0
-   * @param[in] op     Reduction operation to perform.
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis,
-                         ReduceOperation op);
-
-  // Inherited methods overridden:
-  void run(const Window &window, const ThreadInfo &info) override;
-  BorderSize border_size() const override;
-
-private:
-  const ITensor *_input;
-  ITensor *_output;
-  unsigned int _reduction_axis;
-  ReduceOperation _op;
-  BorderSize _border_size;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_NEREDUCTIONOPERATIONKERNELEX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/TypesEx.h b/compute/ARMComputeEx/arm_compute/core/TypesEx.h
deleted file mode 100644
index faba8a449..000000000
--- a/compute/ARMComputeEx/arm_compute/core/TypesEx.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_TYPESEX_H__
-#define __ARM_COMPUTE_TYPESEX_H__
-
-namespace arm_compute
-{
-
-/** Available ArgIndex operations */
-enum class ArgOperation
-{
-  MAX, /**< Max */
-  MIN, /**< Min */
-};
-
-/** Available reduce operations */
-enum class ReduceOperation
-{
-  MAX,  /**< Max */
-  MEAN, /**< Mean */
-  SUM,  /**< Sum */
-  MIN,  /**< Min */
-};
-
-/** Available binary logical operations */
-enum class BinaryLogicalOperation
-{
-  AND, /**< AND */
-  OR,  /**< OR */
-};
-
-/** Available extended comparison operations */
-enum class ComparisonOperationEx
-{
-  EQUAL,     /**< EQUAL */
-  NOT_EQUAL, /**< NOT_EQUAL */
-};
-
-/** Available extended element-wise unary operations */
-enum class ElementWiseUnaryEx
-{
-  NEG, /**< NEG */
-};
-
-/** Sub data type tag
- *
- * NOTE(review): the meaning is not visible in this header; presumably it marks tensors whose
- * elements should be interpreted as booleans - confirm against the code that uses it.
- */
-enum class SubDataType
-{
-  NONE, /**< NONE */
-  BOOL, /**< BOOL */
-};
-
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_TYPESEX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/UtilsEx.h b/compute/ARMComputeEx/arm_compute/core/UtilsEx.h
deleted file mode 100644
index d57e8fcf5..000000000
--- a/compute/ARMComputeEx/arm_compute/core/UtilsEx.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_UTILSEX_H__
-#define __ARM_COMPUTE_UTILSEX_H__
-
-#include <utility>
-
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-
-/** Returns expected width and height of the transpose convolution's output tensor.
- *
- * @note This function was copied in order to fix a bug that computed the wrong output dimensions.
- *
- * @param[in] in_width      Width of input tensor (Number of columns)
- * @param[in] in_height     Height of input tensor (Number of rows)
- * @param[in] kernel_width  Kernel width.
- * @param[in] kernel_height Kernel height.
- * @param[in] info          padding and stride info.
- * @param[in] invalid_right The number of zeros added to right edge of the output.
- * @param[in] invalid_top   The number of zeros added to bottom edge of the output.
- *                          NOTE(review): the parameter is named "top" but is described as the
- *                          bottom edge - confirm which one the definition actually uses.
- *
- * @return A pair with the new width in the first position and the new height in the second.
- */
-const std::pair<unsigned int, unsigned int>
-transposeconv_output_dimensions(unsigned int in_width, unsigned int in_height,
-                                unsigned int kernel_width, unsigned int kernel_height,
-                                const PadStrideInfo &info, unsigned int invalid_right,
-                                unsigned int invalid_top);
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_UTILSEX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h b/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h
deleted file mode 100644
index 1e69f0912..000000000
--- a/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h
+++ /dev/null
@@ -1,275 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_MISC_SHAPE_CALCULATOR_EX_H__
-#define __ARM_COMPUTE_MISC_SHAPE_CALCULATOR_EX_H__
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensorInfo.h"
-#include "arm_compute/core/Utils.h"
-
-#include "arm_compute/core/utils/helpers/tensor_transform.h"
-
-#include <cmath>
-
-namespace arm_compute
-{
-namespace misc
-{
-namespace shape_calculator
-{
-
-/** Calculate the upsampled output shape used for transpose convolution
- *
- * The input is upsampled by the stride and then padded so that a stride-1 convolution with
- * @p weights produces @p out_dims. The padding that has to be applied on each side is returned
- * through @p pad_left, @p pad_right, @p pad_top and @p pad_bottom.
- *
- * @note All arithmetic is done on unsigned integers, so the arguments must be consistent with
- *       each other; a subtraction that would go negative wraps around instead.
- *
- * @param[in]  input          Input tensor info
- * @param[in]  weights        Weights tensor info
- * @param[in]  info           Padding and stride info
- * @param[in]  out_dims       Output shape dimensions (width in first, height in second).
- *                            Only read by this function.
- * @param[in]  invalid_right  The number of zeros added to right edge of the output.
- * @param[in]  invalid_bottom The number of zeros added to bottom edge of the output.
- * @param[out] pad_left       Padding on left
- * @param[out] pad_right      Padding on right
- * @param[out] pad_top        Padding on top
- * @param[out] pad_bottom     Padding on bottom
- *
- * @return the calculated shape
- */
-inline TensorShape compute_transposeconv_upsampled_shape(
-    const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &info,
-    std::pair<unsigned int, unsigned int> &out_dims, unsigned int invalid_right,
-    unsigned int invalid_bottom, unsigned int &pad_left, unsigned int &pad_right,
-    unsigned int &pad_top, unsigned int &pad_bottom)
-{
-  const unsigned int sx = info.stride().first;
-  const unsigned int sy = info.stride().second;
-  const DataLayout data_layout = input.data_layout();
-  const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-  const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
-  // Find the upsampled dimensions
-  // transpose conv out:
-  //    tconv_out + pad = 1 + (in - 1) * stride + invalid
-  //    tconv_out = 1 + (in - 1) * stride + invalid - pad
-  // upsample out:
-  //    upsample_out = 1 + (in - 1) * stride
-  unsigned int out_x = (input.dimension(idx_w) - 1) * sx + 1;
-  unsigned int out_y = (input.dimension(idx_h) - 1) * sy + 1;
-
-  // Find the padding needed for the convolution with stride 1 in order to match output shape
-  // upsample+pad out:
-  //    upsample_out + pad = tconv_out + kernel - 1
-  //    pad = tconv_out + kernel - 1 - upsample_out
-  const unsigned int padx = out_dims.first - (out_x - weights.dimension(idx_w) + 1);
-  const unsigned int pady = out_dims.second - (out_y - weights.dimension(idx_h) + 1);
-  out_x += padx;
-  out_y += pady;
-
-  // Total padding per axis, excluding the "invalid" zeros that belong to one side only.
-  const unsigned int padx_all_except_invalid =
-      padx + info.pad_left() + info.pad_right() - invalid_right;
-  const unsigned int pady_all_except_invalid =
-      pady + info.pad_top() + info.pad_bottom() - invalid_bottom;
-
-  // Split each axis' total between its two sides: left/right come from the horizontal total,
-  // top/bottom from the vertical total. The leading side gets the extra element when the total
-  // is odd; the invalid zeros are added back on the right/bottom side.
-  pad_left = (padx_all_except_invalid + 1) / 2 - info.pad_left();
-  pad_right = padx_all_except_invalid / 2 - info.pad_right() + invalid_right;
-  pad_top = (pady_all_except_invalid + 1) / 2 - info.pad_top();
-  pad_bottom = pady_all_except_invalid / 2 - info.pad_bottom() + invalid_bottom;
-
-  TensorShape scale_out_shape(input.tensor_shape());
-  scale_out_shape.set(idx_w, out_x);
-  scale_out_shape.set(idx_h, out_y);
-
-  return scale_out_shape;
-}
-
-/** Compute the output shape of a transpose convolution layer
- *
- * The result starts as a copy of the input shape. Its width and height are replaced by
- * @p out_dims and its channel dimension by the weights' extent along the batch dimension.
- *
- * @param[in] out_dims Output width (first) and height (second)
- * @param[in] input    Input tensor info
- * @param[in] weights  Weights tensor info
- *
- * @return the calculated shape
- */
-inline TensorShape
-compute_transposeconv_output_shape(const std::pair<unsigned int, unsigned int> &out_dims,
-                                   const ITensorInfo &input, const ITensorInfo &weights)
-{
-  const DataLayout layout = input.data_layout();
-
-  // Resolve every dimension index from the input's data layout up front.
-  const int idx_w = get_data_layout_dimension_index(layout, DataLayoutDimension::WIDTH);
-  const int idx_h = get_data_layout_dimension_index(layout, DataLayoutDimension::HEIGHT);
-  const int idx_c = get_data_layout_dimension_index(layout, DataLayoutDimension::CHANNEL);
-  const int idx_n = get_data_layout_dimension_index(layout, DataLayoutDimension::BATCHES);
-
-  const TensorShape kernel_shape{weights.tensor_shape()};
-
-  TensorShape result{input.tensor_shape()};
-  result.set(idx_w, out_dims.first);
-  result.set(idx_h, out_dims.second);
-  // The output channel count is read from the weights' batch dimension.
-  result.set(idx_c, kernel_shape[idx_n]);
-  return result;
-}
-
-/** Compute the depth to space output shape of a tensor
- *
- * Width and height are multiplied by @p block; the channel dimension is divided by the square
- * of @p block.
- *
- * @param[in] input Input tensor info
- * @param[in] block Block shape value (asserted to be at least 2)
- *
- * @return the calculated shape
- */
-inline TensorShape compute_depth_to_space_shape_ex(const ITensorInfo *input, int block)
-{
-  ARM_COMPUTE_ERROR_ON(block < 2);
-
-  const DataLayout layout = input->data_layout();
-  const int w_idx = get_data_layout_dimension_index(layout, DataLayoutDimension::WIDTH);
-  const int h_idx = get_data_layout_dimension_index(layout, DataLayoutDimension::HEIGHT);
-  const int c_idx = get_data_layout_dimension_index(layout, DataLayoutDimension::CHANNEL);
-
-  // Spatial extents grow by the block size; the channel extent shrinks by its square.
-  const auto out_w = input->dimension(w_idx) * block;
-  const auto out_h = input->dimension(h_idx) * block;
-  const auto out_c = input->dimension(c_idx) / (block * block);
-
-  TensorShape result{input->tensor_shape()};
-  result.set(w_idx, out_w);
-  result.set(h_idx, out_h);
-  result.set(c_idx, out_c);
-  return result;
-}
-
-/** Calculate the space to depth output shape of a tensor
- *
- * Space-to-depth is the inverse of depth-to-space: width and height are divided by
- * @p block_shape and the channel dimension is multiplied by the square of @p block_shape.
- *
- * @param[in] input       Input tensor info
- * @param[in] block_shape Block shape value. Must be at least 2, and the input width and height
- *                        must be multiples of it.
- *
- * @return the calculated shape
- */
-inline TensorShape compute_space_to_depth_shape_ex(const ITensorInfo *input, int32_t block_shape)
-{
-  ARM_COMPUTE_ERROR_ON(block_shape < 2);
-  TensorShape output_shape{input->tensor_shape()};
-
-  const DataLayout data_layout = input->data_layout();
-  const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-  const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-  const int idx_depth = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-
-  // The spatial dimensions must split evenly into blocks, otherwise the division truncates.
-  ARM_COMPUTE_ERROR_ON(input->tensor_shape()[idx_width] % block_shape != 0);
-  ARM_COMPUTE_ERROR_ON(input->tensor_shape()[idx_height] % block_shape != 0);
-
-  // Each block_shape x block_shape spatial block is folded into the channel dimension.
-  output_shape.set(idx_width, input->tensor_shape()[idx_width] / block_shape);
-  output_shape.set(idx_height, input->tensor_shape()[idx_height] / block_shape);
-  output_shape.set(idx_depth, input->tensor_shape()[idx_depth] * (block_shape * block_shape));
-
-  return output_shape;
-}
-
-/** Calculate the gather output shape of a tensor
- *
- * - If @p indices_shape has exactly one dimension, the output is the input shape with the
- *   dimension at @p actual_axis replaced by the number of indices.
- * - If @p indices_shape has more than one dimension, the dimension at @p actual_axis is replaced
- *   by all dimensions of @p indices_shape, so the output rank grows by
- *   (number of indices dimensions - 1).
- * - Otherwise (no indices dimensions) the input shape is returned unchanged.
- *
- * @param[in] input_shape   Input tensor shape (asserted to have at most 4 dimensions)
- * @param[in] indices_shape Indices tensor shape (asserted to have at most 3 dimensions)
- * @param[in] actual_axis   The axis to be gathered (asserted to be a valid input dimension)
- *
- * @return the calculated shape
- */
-inline TensorShape compute_gather_shape_ex(const TensorShape &input_shape,
-                                           const TensorShape &indices_shape, uint32_t actual_axis)
-{
-  ARM_COMPUTE_ERROR_ON(indices_shape.num_dimensions() > 3);
-  ARM_COMPUTE_ERROR_ON(input_shape.num_dimensions() > 4);
-  // The resulting rank (input rank + indices rank - 1) must not exceed 4 either.
-  ARM_COMPUTE_ERROR_ON(input_shape.num_dimensions() + indices_shape.num_dimensions() - 1 > 4);
-  ARM_COMPUTE_ERROR_ON(actual_axis >= input_shape.num_dimensions());
-
-  TensorShape output_shape = input_shape;
-  if (indices_shape.num_dimensions() == 1)
-  {
-    output_shape[actual_axis] = indices_shape[0];
-  }
-  else if (indices_shape.num_dimensions() > 1)
-  {
-    // Make room for the extra dimensions contributed by the indices.
-    output_shape.shift_right(indices_shape.num_dimensions() - 1);
-
-    // 'o' walks the output dimensions, 'i' the input dimensions.
-    for (uint32_t i = 0, o = 0; o < output_shape.num_dimensions(); ++o, ++i)
-    {
-      if (o == actual_axis)
-      {
-        // The gathered input dimension is consumed here and replaced by every indices dimension.
-        ++i;
-        for (uint32_t in = 0; in < indices_shape.num_dimensions(); ++in, ++o)
-        {
-          output_shape[o] = indices_shape[in];
-        }
-        // The enclosing loop header still runs ++o and ++i after this branch, so the output
-        // position right after the indices dimensions is not written by this loop.
-        // NOTE(review): that position presumably already holds the right input dimension
-        // because of the shift_right() above - confirm against TensorShape::shift_right.
-      }
-      else
-      {
-        output_shape[o] = input_shape[i];
-      }
-    }
-  }
-  return output_shape;
-}
-
-/** Calculate the one-hot output shape of a tensor
- *
- * The output has @p depth at position @p actual_axis; the dimensions of @p indices_shape fill
- * the remaining positions in their original order.
- *
- * @param[in] indices_shape Indices tensor shape (asserted to have at most 3 dimensions)
- * @param[in] depth         Size of the one-hot dimension
- * @param[in] actual_axis   Position of the one-hot dimension in the output (asserted to be at
- *                          most the number of indices dimensions)
- *
- * @return the calculated shape
- */
-inline TensorShape compute_onehot_shape_ex(const TensorShape &indices_shape, uint32_t depth,
-                                           uint32_t actual_axis)
-{
-  ARM_COMPUTE_ERROR_ON(indices_shape.num_dimensions() > 3);
-  ARM_COMPUTE_ERROR_ON(actual_axis > indices_shape.num_dimensions());
-
-  TensorShape output_shape;
-  output_shape.set(actual_axis, depth);
-
-  // Once the one-hot axis has been reached, every following indices dimension lands one
-  // position further to the right in the output.
-  unsigned int i_shift = 0;
-  for (unsigned int i = 0; i < indices_shape.num_dimensions(); ++i)
-  {
-    if (i == actual_axis)
-    {
-      i_shift++;
-    }
-    output_shape.set(i + i_shift, indices_shape[i]);
-  }
-
-  return output_shape;
-}
-
-} // namespace shape_calculator
-} // namespace misc
-} // namespace arm_compute
-
-#endif // __ARM_COMPUTE_MISC_SHAPE_CALCULATOR_EX_H__
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
deleted file mode 100644
index 484ebfd0b..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef __ARM_COMPUTE_CLFUNCTIONSEX_H__
-#define __ARM_COMPUTE_CLFUNCTIONSEX_H__
-
-#include <arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h>
-#include <arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h>
-#include <arm_compute/runtime/CL/functions/CLCastBool.h>
-#include <arm_compute/runtime/CL/functions/CLEmbeddingLookup.h>
-#include <arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h>
-#include <arm_compute/runtime/CL/functions/CLGatherEx.h>
-#include <arm_compute/runtime/CL/functions/CLHashtableLookup.h>
-#include <arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h>
-#include <arm_compute/runtime/CL/functions/CLNeg.h>
-#include <arm_compute/runtime/CL/functions/CLOneHot.h>
-#include <arm_compute/runtime/CL/functions/CLReduceOperation.h>
-#include <arm_compute/runtime/CL/functions/CLSplitVEx.h>
-#include <arm_compute/runtime/CL/functions/CLTopKV2.h>
-#include <arm_compute/runtime/CL/functions/CLTransposeConvLayer.h>
-
-#endif // __ARM_COMPUTE_CLFUNCTIONSEX_H__
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h
deleted file mode 100644
index b1ee52bf9..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_CLARGMINMAXLAYEREX_H__
-#define __ARM_COMPUTE_CLARGMINMAXLAYEREX_H__
-
-#include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h"
-#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-namespace arm_compute
-{
-class ITensorInfo;
-class ICLTensor;
-
-/** Function to calculate the index of the minimum or maximum values in a
- *  tensor based on an axis.
- *
- * @note The default data type for an uninitialized output tensor is
- *       signed 32-bit integer (S32). It is the user's responsibility to check
- *       that the results do not overflow because the indices are computed
- *       in unsigned 32-bit (U32).
- */
-class CLArgMinMaxLayerEx : public IFunction
-{
-public:
-  /** Constructor.
-   *
-   * @param[in] memory_manager (Optional) Memory manager. Defaults to nullptr.
-   */
-  CLArgMinMaxLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-  /** Set the input and output tensors.
-   *
-   * @param[in]  input  Input source tensor. Data types supported: QASYMM8/F16/F32.
-   * @param[in]  axis   Axis to find max/min index.
-   * @param[out] output Output tensor. Data types supported: U32/S32.
-   * @param[in]  op     Reduction operation to perform. Operations supported: ARG_IDX_MAX,
-   * ARG_IDX_MIN
-   */
-  void configure(const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op);
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * CLArgMinMaxLayerEx
-   *
-   * @param[in] input  Input source tensor info. Data types supported: QASYMM8/F16/F32.
-   * @param[in] axis   Axis to find max/min index.
-   * @param[in] output Output tensor info. Data types supported: U32/S32.
-   * @param[in] op     Reduction operation to perform. Operations supported: ARG_IDX_MAX,
-   * ARG_IDX_MIN
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, int axis, const ITensorInfo *output,
-                         const ReductionOperation &op);
-
-  // Inherited methods overridden:
-  void run() override;
-
-private:
-  MemoryGroup _memory_group;
-  std::vector<CLTensor> _results_vector;
-  CLTensor _not_reshaped_output;
-  std::vector<CLArgMinMaxLayerKernelEx> _reduction_kernels_vector;
-  CLReshapeLayerKernel _reshape_kernel;
-  unsigned int _num_of_stages;
-  unsigned int _reduction_axis;
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_CLARGMINMAXLAYEREX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h
deleted file mode 100644
index 88a9b00ec..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLBINARYLOGICALOP_H__
-#define __ARM_COMPUTE_CLBINARYLOGICALOP_H__
-
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-#include "arm_compute/core/TypesEx.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Function that applies a binary logical operation (see @ref BinaryLogicalOperation) to two
- *  input tensors.
- */
-class CLBinaryLogicalOp : public ICLSimpleFunction
-{
-public:
-  /** Initialise the function's source and destination.
-   *
-   * @param[in]  input1 Source tensor1. Data types supported: U8, QASYMM8.
-   * @param[in]  input2 Source tensor2. Data types supported: U8, QASYMM8.
-   * @param[out] output Output tensor. Data types supported: U8, QASYMM8.
-   * @param[in]  op     Binary logical operation to perform.
-   */
-  void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output,
-                 BinaryLogicalOperation op);
-};
-
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLBINARYLOGICALOP_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h
deleted file mode 100644
index d6150684a..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/**
- * @file CLCastBool.h
- * @ingroup COM_AI_RUNTIME
- * @brief This file contains arm_compute::CLCastBool class
- */
-
-#ifndef ARM_COMPUTE_CLCASTBOOL_H
-#define ARM_COMPUTE_CLCASTBOOL_H
-
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/**
- * @brief Class to run @ref CLCastBoolKernel.
- * This converts the boolean input tensor to the output tensor's type.
- */
-class CLCastBool : public ICLSimpleFunction
-{
-public:
-  /**
-   * @brief Initialise the function's input and output
-   * @param[in]  input   Input tensor. Data types supported: U8
-   * @param[out] output  Output tensor. Data types supported: U8/S8/U16/S16/U32/F16/F32.
-   */
-  void configure(ICLTensor *input, ICLTensor *output);
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLCASTBOOL_H */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h
deleted file mode 100644
index 409eaf593..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_CLDIRECTTRANSPOSECONVLAYER_H__
-#define __ARM_COMPUTE_CLDIRECTTRANSPOSECONVLAYER_H__
-
-#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
-#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h"
-#include "arm_compute/runtime/CL/functions/CLReverse.h"
-#include "arm_compute/runtime/CL/functions/CLTranspose.h"
-
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-class ICLTensor;
-/** Function to run the deconvolution layer.
- *
- * Deconvolution Layer is the backward pass of Convolution Layer. First we transform the input
- * depending on the stride and pad info and then perform a 1x1
- * convolution pass. Input stride defines how many zeroes we should put between each element of the
- * input and pad is the amount of padding.
- *
- *  The relation between input and output is as follows:
- *  \f[
- *       width\_output = (width\_input - 1) \cdot stride\_x - 2 \cdot padding\_x + kernel\_x
- *  \f]
- *  \f[
- *       height\_output = (height\_input - 1) \cdot stride\_y - 2 \cdot padding\_y + kernel\_y
- *  \f]
- *
- *  where:
- *      width_input is the size of the first input dimension.
- *      height_input is the size of the second input dimension.
- *      width_output is the size of the first output dimension.
- *      height_output is the size of the second output dimension.
- *      kernel_x and kernel_y are the convolution sizes in x and y.
- *      stride_x and stride_y are the input strides of the first and second dimension.
- *
- * The weights used by Deconvolution are supposed to be the same as the ones used for Convolution.
- * Therefore, it will be necessary to use the weights in the
- * reverse order to perform an actual convolution. This is achieved by using @ref CLReverse.
- *
- * This function calls the following OpenCL kernels/functions:
- *
- * -# @ref CLDeconvolutionLayerUpsample
- * -# @ref CLConvolutionLayer
- *
- * And the following OpenCL function to flip the weights:
- * -# @ref CLReverse
- *
- */
-class CLDirectTransposeConvLayer : public IFunction
-{
-public:
-  /** Constructor */
-  CLDirectTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  CLDirectTransposeConvLayer(const CLDirectTransposeConvLayer &) = delete;
-  /** Default move constructor */
-  CLDirectTransposeConvLayer(CLDirectTransposeConvLayer &&) = default;
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  CLDirectTransposeConvLayer &operator=(const CLDirectTransposeConvLayer &) = delete;
-  /** Default move assignment operator */
-  CLDirectTransposeConvLayer &operator=(CLDirectTransposeConvLayer &&) = default;
-  /** Set the input, weights, biases and output tensors.
-   *
-   * @param[in,out] input        Input tensor. 3 lower dimensions represent a single input, and an
-   *                             optional 4th dimension for batch of inputs.
-   *                             Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
-   * @param[in]     weights      The 4d weights with dimensions [width, height, IFM, OFM]. Data type
-   *                             supported: Same as @p input.
-   * @param[in]     bias         (Optional) The biases have one dimension.
-   *                             Data type supported: Should match @p input data type, except for
-   *                             QASYMM8 and QASYMM8_SIGNED inputs, where biases should be S32.
-   * @param[out]    output       Output tensor. The output has the same number of dimensions as the
-   *                             @p input.
-   * @param[in]     info         Contains padding and policies to be used in the deconvolution, this
-   *                             is described in @ref PadStrideInfo.
-   * @param[in]     invalid_right  The number of zeros added to right edge of the output.
-   * @param[in]     invalid_bottom The number of zeros added to bottom edge of the output.
-   * @param[in]     weights_info (Optional) Weights information needed for @ref CLConvolutionLayer,
-   *                             specifies if the weights tensor has been reshaped with
-   *                             @ref CLWeightsReshapeKernel.
-   */
-  void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
-                 const PadStrideInfo &info, unsigned int invalid_right, unsigned int invalid_bottom,
-                 const WeightsInfo &weights_info = WeightsInfo());
-  /** Set the input, weights, biases and output tensors.
-   *
-   * @param[in]     compile_context The compile context to be used.
-   * @param[in,out] input           Input tensor. 3 lower dimensions represent a single input, and
-   *                                an optional 4th dimension for batch of inputs.
-   *                                Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
-   * @param[in]     weights         The 4d weights with dimensions [width, height, IFM, OFM]. Data
-   *                                type supported: Same as @p input.
-   * @param[in]     bias            (Optional) The biases have one dimension.
-   *                                Data type supported: Should match @p input data type, except for
-   *                                QASYMM8 and QASYMM8_SIGNED inputs, where biases should be S32.
-   * @param[out]    output          Output tensor. The output has the same number of dimensions as
-   *                                the @p input.
-   * @param[in]     info            Contains padding and policies to be used in the deconvolution,
-   *                                this is described in @ref PadStrideInfo.
-   * @param[in]     invalid_right   The number of zeros added to right edge of the output.
-   * @param[in]     invalid_bottom  The number of zeros added to bottom edge of the output.
-   * @param[in]     weights_info    (Optional) Weights information needed for @ref
-   *                                CLConvolutionLayer, specifies if the weights tensor has been
-   *                                reshaped with @ref CLWeightsReshapeKernel.
-   *
-   */
-  void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights,
-                 const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
-                 unsigned int invalid_right, unsigned int invalid_bottom,
-                 const WeightsInfo &weights_info = WeightsInfo());
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * CLDirectTransposeConvLayer
-   *
-   * @param[in] input        Input tensor info. 3 lower dimensions represent a single input, and an
-   *                         optional 4th dimension for batch of inputs.
-   *                         Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
-   * @param[in] weights      The 4d weights info with dimensions [width, height, IFM, OFM]. Data
-   *                         type supported: Same as @p input.
-   * @param[in] bias         (Optional) The biases have one dimension.
-   *                         Data type supported: Should match @p input data type, except for input
-   *                         of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
-   * @param[in] output       Output tensor info. The output has the same number of dimensions as the
-   *                         @p input.
-   * @param[in] info         Contains padding and policies to be used in the deconvolution, this is
-   *                         described in @ref PadStrideInfo.
-   * @param[in] invalid_right  The number of zeros added to right edge of the output.
-   * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
-   * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer,
-   *                         specifies if weights were reshaped with @ref CLWeightsReshapeKernel.
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
-                         const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
-                         unsigned int invalid_right, unsigned int invalid_bottom,
-                         const WeightsInfo &weights_info = WeightsInfo());
-
-  // Inherited methods overridden:
-  void run() override;
-  void prepare() override;
-
-private:
-  MemoryGroup _memory_group;
-  CLDeconvolutionLayerUpsample _scale_f;
-  CLConvolutionLayer _conv_f;
-  CLReverse _flip_weights;
-
-  CLTensor _scaled_output;
-  ICLTensor *_original_weights;
-  CLTensor _weights_flipped;
-  CLTensor _flip_axis;
-
-  bool _is_prepared;
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_CLDIRECTTRANSPOSECONVLAYER_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h
deleted file mode 100644
index fbee7e40e..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/**
- * @file CLEmbeddingLookup.h
- * @ingroup COM_AI_RUNTIME
- * @brief This file contains arm_compute::CLEmbeddingLookup class
- */
-
-#ifndef __ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__
-#define __ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__
-
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include <vector>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/**
- * @brief Class to perform EmbeddingLookup operation
- */
-class CLEmbeddingLookup : public ICLSimpleFunction
-{
-public:
-  /**
-   * @brief Set the input, output and lookups tensors.
-   * @param[in]  input    Source tensor.
-   *                      Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
-   * @param[out] output   Destination tensor. Data types and data layouts supported: Same as @p
-   *                      input.
-   * @param[in]  lookups  Lookups 1D tensor whose values are indices into the first dimension of
-   *                      @p input.
-   * @return N/A
-   */
-  void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups);
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h
deleted file mode 100644
index f3266f688..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h
+++ /dev/null
@@ -1,186 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLFULLYCONNECTEDHYBRIDLAYER_H__
-#define __ARM_COMPUTE_CLFULLYCONNECTEDHYBRIDLAYER_H__
-
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
-#include "arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h"
-#include "arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h"
-#include "arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h"
-#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
-
-namespace arm_compute
-{
-/** Basic function to reshape the weights of Fully Connected layer with OpenCL. This function calls
- * the following kernels:
- *
- *  -# @ref CLTransposeKernel
- *
- * @note  The fully connected layer accepts "weights" tensors only with 2 dimensions.
- */
-class CLFullyConnectedHybridLayerReshapeWeights : public ICLSimpleFunction
-{
-public:
-  /** Set the input and output tensors.
-   *
-   * @param[in]  input  Weights tensor. The weights must be 2 dimensional. Data types supported:
-   * S8.
-   * @param[out] output Destination tensor which stores the transposed input tensor. Data type
-   * supported: Same as @p input.
-   */
-  void configure(const ICLTensor *input, ICLTensor *output);
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * CLFullyConnectedHybridLayerReshapeWeights
-   *
-   * @param[in] input  Weights tensor info. The weights must be 2 dimensional. Data types supported:
-   * S8.
-   * @param[in] output Destination tensor info for the transposed input tensor. Data type
-   * supported: Same as @p input.
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-};
-
-/** Basic function to compute a Fully Connected layer on OpenCL. This function calls the following
- * OpenCL kernels/functions, each held as a private member:
- *
- *  -# @ref CLFullyConnectedHybridLayerReshapeWeights (if @p are_weights_reshaped is set to false
- * and transpose_weights is set to true ) (called once)
- *  -# @ref CLScaleFactorSymm8Kernel and @ref CLQuantizationSymmetricKernel
- *  -# @ref CLGEMMLowpMatrixMultiplyCore and @ref CLMultiplyScaleFactorKernel
- *  -# @ref CLGEMMMatrixAccumulateBiasesKernel (if @p biases is not equal to nullptr)
- *
- * @note  The fully connected layer accepts "weights" tensors only with 2 dimensions.
- */
-class CLFullyConnectedHybridLayer : public IFunction
-{
-public:
-  /** Constructor */
-  CLFullyConnectedHybridLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  CLFullyConnectedHybridLayer(const CLFullyConnectedHybridLayer &) = delete;
-  /** Default move constructor */
-  CLFullyConnectedHybridLayer(CLFullyConnectedHybridLayer &&) = default;
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  CLFullyConnectedHybridLayer &operator=(const CLFullyConnectedHybridLayer &) = delete;
-  /** Default move assignment operator */
-  CLFullyConnectedHybridLayer &operator=(CLFullyConnectedHybridLayer &&) = default;
-  /** Set the input and output tensors.
-   *
-   * @param[in]  input   Source tensor. Data type supported: F16/F32.
-   * @param[in]  weights Weights tensor. The weights must be 2 dimensional.
-   *                     If this function is called after a Convolution Layer, the (transposed)
-   * weights will have as many rows as the product of the first 3 input's dimensions.
-   *                     If it is called after another FullyConnected Layer, the (transposed)
-   * weights will have as many rows as the input's first dimension.
-   *                     Data type supported: S8.
-   * @param[in]  biases  Bias tensor. Can be nullptr. Data type supported: Same as @p input.
-   * @param[out] output  Destination tensor. Its shape should be equal to the output of a matrix
-   * multiplication between:
-   *                     - The output of im2col on the input and the (transposed) 2D weights, if the
-   * function is called after a Convolution Layer
-   *                     - The input tensor and the (transposed) 2D weights, if the function is
-   * called after another FullyConnected Layer.
-   *                     Data type supported: Same as @p input.
-   * @param[in]  fc_info (Optional) Fully connected layer additional info
-   */
-  void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases,
-                 ICLTensor *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * CLFullyConnectedHybridLayer
-   *
-   * @param[in]  input   Source tensor info. Data type supported: F16/F32.
-   * @param[in]  weights Weights tensor info. The weights must be 2 dimensional.
-   *                     If this function is called after a Convolution Layer, the (transposed)
-   * weights will have as many rows as the product of the first 3 input's dimensions.
-   *                     If it is called after another FullyConnected Layer, the (transposed)
-   * weights will have as many rows as the input's first dimension.
-   *                     Data type supported: S8.
-   * @param[in]  biases  Bias tensor info. Can be nullptr. Data type supported: Same as @p input.
-   * @param[in]  output  Destination tensor info. Its shape should be equal to the output of a
-   * matrix multiplication between:
-   *                     - The output of im2col on the input and the (transposed) 2D weights, if the
-   * function is called after a Convolution Layer
-   *                     - The input tensor and the (transposed) 2D weights, if the function is
-   * called after another FullyConnected Layer.
-   *                     Data type supported: Same as @p input.
-   * @param[in]  fc_info (Optional) Fully connected layer additional info
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
-                         const ITensorInfo *biases, const ITensorInfo *output,
-                         FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
-
-  // Inherited methods override
-  void run() override;
-  void prepare() override;
-
-private:
-  void configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output,
-                    bool retain_internal_weights);
-
-  MemoryGroup _memory_group;
-  CLFullyConnectedHybridLayerReshapeWeights _reshape_weights_kernel;
-  CLScaleFactorSymm8Kernel _scale_factor_kernel;
-  CLQuantizationSymmetricKernel _quant_input_kernel;
-  CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
-  CLMultiplyScaleFactorKernel _multiply_scale_kernel;
-  CLGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel; // TODO(COMPMID-1889): Use CLGEMM to
-                                                                // add bias in
-                                                                // CLFullyConnectedHybridLayer
-  CLTensor _reshape_weights_output;
-  CLTensor _quantized_input;
-  CLTensor _scale_factor;
-  CLTensor _gemmlowp_output;
-  bool _are_weights_reshaped;
-  bool _accumulate_biases;
-  bool _is_prepared;
-  const ICLTensor *_original_weights;
-};
-}
-#endif /* __ARM_COMPUTE_CLFULLYCONNECTEDHYBRIDLAYER_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h
deleted file mode 100644
index e65a646dc..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h
+++ /dev/null
@@ -1,235 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLFULLYCONNECTEDLAYEREX_H__
-#define __ARM_COMPUTE_CLFULLYCONNECTEDLAYEREX_H__
-
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
-#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h"
-#include "arm_compute/runtime/CL/functions/CLFlattenLayer.h"
-#include "arm_compute/runtime/CL/functions/CLGEMM.h"
-#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
-#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"
-#include "arm_compute/runtime/IWeightsManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-namespace arm_compute
-{
-/** Basic function to reshape the weights of Fully Connected layer with OpenCL. This function calls
- * the following kernels:
- *
- *  -# @ref CLTransposeKernel
- *
- * @note  The fully connected layer accepts "weights" tensors only with 2 dimensions.
- */
-class CLFullyConnectedLayerReshapeWeightsEx : public ICLSimpleFunction
-{
-public:
-  /** Set the input and output tensors.
-   *
-   * @param[in]  input  Weights tensor. The weights must be 2 dimensional. Data types supported:
-   * QASYMM8/F16/F32.
-   * @param[out] output Destination tensor which stores the transposed input tensor. Data type
-   * supported: Same as @p input.
-   */
-  void configure(const ICLTensor *input, ICLTensor *output);
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * CLFullyConnectedLayerReshapeWeightsEx
-   *
-   * @param[in] input  Weights tensor info. The weights must be 2 dimensional. Data types supported:
-   * QASYMM8/F16/F32.
-   * @param[in] output Destination tensor info for the transposed input tensor. Data type
-   * supported: Same as @p input.
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-};
-
-namespace weights_transformations
-{
-/** Basic function to manage the reshaped weights generated by @ref
- * CLFullyConnectedLayerReshapeWeightsEx */
-class CLFullyConnectedLayerReshapeWeightsExManaged : public ITransformWeights
-{
-public:
-  // Inherited method override: allocates the output tensor, runs the reshape and flags it as run
-  void run() override
-  {
-    _output.allocator()->allocate();
-    _func.run();
-    _reshape_run = true;
-  }
-
-  // Inherited method override: frees the allocation backing the output tensor
-  void release() override { _output.allocator()->free(); }
-
-  // Inherited method override: returns the tensor the reshape function writes into
-  ICLTensor *get_weights() override { return &_output; }
-
-  // Inherited method override: returns the constant identifier _uid
-  uint32_t uid() override { return _uid; }
-
-  /** Configures the @ref CLFullyConnectedLayerReshapeWeightsEx function
-   *
-   * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32.
-   */
-  void configure(const ICLTensor *input) { _func.configure(input, &_output); }
-
-private:
-  static constexpr uint32_t _uid = 0x0; // NOTE(review): presumably unique per transform - confirm
-  CLTensor _output{};                   // Destination of the reshape function
-  CLFullyConnectedLayerReshapeWeightsEx _func{}; // Reshape function configured by configure()
-};
-} // namespace weights_transformations
-
-/** Basic function to compute a Fully Connected layer on OpenCL. This function calls the following
- * OpenCL kernels/functions:
- *
- *  -# @ref CLFlattenLayer (called when the input comes from a convolutional layer)
- *  -# @ref CLFullyConnectedLayerReshapeWeightsEx (if @p are_weights_reshaped is set to false and
- * transpose_weights is set to true ) (called once)
- *  -# @ref CLGEMMMatrixMultiplyKernel or @ref CLGEMMLowpMatrixMultiplyCore (if quantized
- * asymmetric)
- *  -# @ref CLGEMMMatrixAccumulateBiasesKernel or @ref
- * CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric) (if @p biases is
- * not equal to nullptr)
- *
- * @note  The fully connected layer accepts "weights" tensors only with 2 dimensions.
- */
-class CLFullyConnectedLayerEx : public IFunction
-{
-public:
-  /** Constructor */
-  CLFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr,
-                          IWeightsManager *weights_manager = nullptr);
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  CLFullyConnectedLayerEx(const CLFullyConnectedLayerEx &) = delete;
-  /** Default move constructor */
-  CLFullyConnectedLayerEx(CLFullyConnectedLayerEx &&) = default;
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  CLFullyConnectedLayerEx &operator=(const CLFullyConnectedLayerEx &) = delete;
-  /** Default move assignment operator */
-  CLFullyConnectedLayerEx &operator=(CLFullyConnectedLayerEx &&) = default;
-  /** Set the input and output tensors.
-   *
-   * @param[in]  input   Source tensor. Data type supported: QASYMM8/F16/F32.
-   * @param[in]  weights Weights tensor. The weights must be 2 dimensional.
-   *                     If this function is called after a Convolution Layer, the (transposed)
-   * weights will have as many rows as the product of the first 3 input's dimensions.
-   *                     If it is called after another FullyConnected Layer, the (transposed)
-   * weights will have as many rows as the input's first dimension.
-   *                     Data type supported: Same as @p input.
-   * @param[in]  biases  Bias tensor. Can be nullptr. Data type supported: Same as @p input.
-   * @param[out] output  Destination tensor. Its shape should be equal to the output of a matrix
-   * multiplication between:
-   *                     - The output of im2col on the input and the (transposed) 2D weights, if the
-   * function is called after a Convolution Layer
-   *                     - The input tensor and the (transposed) 2D weights, if the function is
-   * called after another FullyConnected Layer.
-   *                     Data type supported: Same as @p input.
-   * @param[in]  fc_info (Optional) Fully connected layer additional info
-   */
-  void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases,
-                 ICLTensor *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * CLFullyConnectedLayerEx
-   *
-   * @param[in]  input   Source tensor info. Data type supported: QASYMM8/F16/F32.
-   * @param[in]  weights Weights tensor info. The weights must be 2 dimensional.
-   *                     If this function is called after a Convolution Layer, the (transposed)
-   * weights will have as many rows as the product of the first 3 input's dimensions.
-   *                     If it is called after another FullyConnected Layer, the (transposed)
-   * weights will have as many rows as the input's first dimension.
-   *                     Data type supported: Same as @p input.
-   * @param[in]  biases  Bias tensor info. Can be nullptr. Data type supported: Same as @p input.
-   * @param[in]  output  Destination tensor info. Its shape should be equal to the output of a
-   * matrix multiplication between:
-   *                     - The output of im2col on the input and the (transposed) 2D weights, if the
-   * function is called after a Convolution Layer
-   *                     - The input tensor and the (transposed) 2D weights, if the function is
-   * called after another FullyConnected Layer.
-   *                     Data type supported: Same as @p input.
-   * @param[in]  fc_info (Optional) Fully connected layer additional info
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
-                         const ITensorInfo *biases, const ITensorInfo *output,
-                         FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
-
-  // Inherited methods override
-  void run() override;
-  void prepare() override;
-
-private:
-  void configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias,
-                       ICLTensor *output, const FullyConnectedLayerInfo &fc_info);
-  void configure_conv_fc(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias,
-                         ICLTensor *output, const FullyConnectedLayerInfo &fc_info);
-  void configure_mm(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias,
-                    ICLTensor *output, const FullyConnectedLayerInfo &fc_info);
-
-  MemoryGroup _memory_group;
-  IWeightsManager *_weights_manager;
-  CLConvertFullyConnectedWeights _convert_weights;
-  weights_transformations::CLConvertFullyConnectedWeightsManaged _convert_weights_managed;
-  weights_transformations::CLFullyConnectedLayerReshapeWeightsExManaged
-      _reshape_weights_managed_function;
-  CLFlattenLayer _flatten_layer;
-  CLFullyConnectedLayerReshapeWeightsEx _reshape_weights_function;
-  CLGEMM _mm_gemm;
-  CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
-  CLTensor _flatten_output;
-  CLTensor _converted_weights_output;
-  CLTensor _reshape_weights_output;
-  bool _are_weights_converted;
-  bool _are_weights_reshaped;
-  bool _is_fc_after_conv;
-  bool _is_quantized;
-  bool _is_prepared;
-  const ICLTensor *_original_weights;
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_CLFULLYCONNECTEDLAYEREX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h
deleted file mode 100644
index 289ab167f..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * @file        CLFullyConnectedReshapingLayer.h
- * @brief       This file contains CLFullyConnectedReshapingLayer class
- * @ingroup     COM_AI_RUNTIME
- */
-
-#ifndef __ARM_COMPUTE_CL_FULLY_CONNECTED_RESHAPING_LAYER_H__
-#define __ARM_COMPUTE_CL_FULLY_CONNECTED_RESHAPING_LAYER_H__
-
-#include <arm_compute/runtime/CL/CLTensor.h>
-#include <arm_compute/runtime/CL/functions/CLReshapeLayer.h>
-#include <arm_compute/runtime/IMemoryManager.h>
-
-namespace arm_compute
-{
-/**
- * @brief Class to run FullyConnected Layer after reshaping input tensor
- */
-class CLFullyConnectedReshapingLayer : public arm_compute::IFunction
-{
-public:
-  enum class KernelType
-  {
-    GENERAL,             //< General FC
-    PREPROCESSED_WEIGHTS //< Weights are constants so it can be preprocessed
-  };
-
-public:
-  CLFullyConnectedReshapingLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr)
-      : _input(nullptr), _weights(nullptr), _biases(nullptr), _output(nullptr), _cl_buffer{},
-        _memory_manager{memory_manager}, _cl_fc{nullptr}, _cl_reshape{}, _needs_reshape(false)
-  {
-    // DO NOTHING
-  }
-
-public:
-  /**
-   * @brief Configure the layer
-   * @param[in] input The source tensor
-   * @param[in] weights The tensor that is filled with weight values
-   * @param[in] biases The tensor that is filled with bias values
-   * @param[in] output The destination tensor
-   * @param[in] needs_reshape Whether it needs to be reshaped or not
-   * @param[in] reshape The tensor shape to be reshaped. Only valid when needs_reshape is true.
-   * @return N/A
-   */
-  void configure(const arm_compute::ICLTensor *input, const arm_compute::ICLTensor *weights,
-                 const arm_compute::ICLTensor *biases, arm_compute::ICLTensor *output,
-                 bool needs_reshape, const arm_compute::TensorShape &reshape,
-                 KernelType kernel_type);
-
-public:
-  /**
-   * @brief Run the operation. Must be called after configure().
-   * @return N/A
-   */
-  void run(void) override;
-  /**
-   * @brief Prepare the operation
-   * @return N/A
-   */
-  void prepare(void) override;
-
-private:
-  const arm_compute::ICLTensor *_input;
-  const arm_compute::ICLTensor *_weights;
-  const arm_compute::ICLTensor *_biases;
-  arm_compute::ICLTensor *_output;
-
-  // buffer for reshaping input tensor
-  arm_compute::CLTensor _cl_buffer;
-
-private:
-  std::shared_ptr<IMemoryManager> _memory_manager;
-  std::unique_ptr<arm_compute::IFunction> _cl_fc;
-  CLReshapeLayer _cl_reshape;
-  bool _needs_reshape;
-};
-} // namespace arm_compute
-
-#endif // __ARM_COMPUTE_CL_FULLY_CONNECTED_RESHAPING_LAYER_H__
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h
deleted file mode 100644
index b01ec4255..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/**
- * @file CLGatherEx.h
- * @brief       This file contains CLGatherEx class
- * @ingroup     COM_AI_RUNTIME
- */
-
-#ifndef __ARM_COMPUTE_CLGATHEREX_H__
-#define __ARM_COMPUTE_CLGATHEREX_H__
-
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/**
- * @brief Class to run @ref CLGatherKernel.
- */
-class CLGatherEx : public ICLSimpleFunction
-{
-public:
-  /**
-   * @brief Initialise the kernel's inputs, output and gather axis.
-   * @param[in]  input   An input tensor. Data types supported: U8/QASYMM8/S32/F32.
-   * @param[in]  indices An indexes tensor. Data types supported: S32.
-   * @param[out] output  The output tensor, Data types supported: same as @p input.
-   * @param[in]  axis    (Optional) The axis in @p input to gather @p indices from. Defaults to 0
-   * @return N/A
- */
-  void configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0);
-
-  /**
-   * @brief Static function to check if given info will lead to a valid configuration
-   *        of @ref CLGatherEx
-   * @param[in]  input   An input tensor. Data types supported: U8/QASYMM8/S32/F32.
-   * @param[in]  indices An indexes tensor. Data types supported: S32.
-   * @param[in]  output  The output tensor info. Data types supported: same as @p input.
-   * @param[in]  axis    (Optional) The axis in @p input to gather @p indices from. Defaults to 0
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *indices,
-                         const ITensorInfo *output, int axis = 0);
-};
-}
-#endif /*__ARM_COMPUTE_CLGATHEREX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h
deleted file mode 100644
index 6618f5aa4..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/**
- * @file CLHashtableLookup.h
- * @ingroup COM_AI_RUNTIME
- * @brief This file contains arm_compute::CLHashtableLookup class
- */
-
-#ifndef __ARM_COMPUTE_CLHASHTABLELOOKUP_H__
-#define __ARM_COMPUTE_CLHASHTABLELOOKUP_H__
-
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include <vector>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/**
- * @brief Class to perform HashtableLookup operation
- */
-class CLHashtableLookup : public ICLSimpleFunction
-{
-public:
-  /**
-   * @brief Set the input and output tensors.
-   * @param[in]  lookups  Lookups 1D tensor whose values are indices into the first dimension of
-   *                      input.
-   * @param[in]  keys     Keys 1D tensor. keys and input pair represent a map.
-   *                      Data types supported: S32
-   * @param[in]  input    Source tensor.
-   *                      Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
-   * @param[out] output   Destination tensor. Data types and data layouts supported: Same as @p
-   *                      input.
-   * @param[out] hits     Hits 1D tensor. A boolean tensor that indicates whether the lookup hits
-   *                      (True) or not (False). Data types supported: U8/QASYMM8
-   * @return N/A
-   */
-  void configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *intput,
-                 ICLTensor *output, ICLTensor *hits);
-};
-}
-#endif /*__ARM_COMPUTE_CLHASHTABLELOOKUP_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h
deleted file mode 100644
index 887e7aaa5..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__
-#define __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__
-
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to perform an instance normalization.
- *
- * This function runs the following kernels:
- * -# @ref CLInstanceNormalizationLayerKernelEx
- */
-class CLInstanceNormalizationLayerEx : public ICLSimpleFunction
-{
-public:
-  /** Default constructor */
-  CLInstanceNormalizationLayerEx();
-  /** Set the input and output tensors.
-   *
-   * @param[in, out] input   Source tensor. In case of @p output tensor = nullptr this tensor will
-   * store the result of the normalization.
-   *                         Data types supported: F16/F32. Data layout supported: NHWC, NCHW
-   * @param[out]     output  Destination tensor. Data types and data layouts supported: same as @p
-   * input.
-   * @param[in]      gamma   (Optional) The scale tensor applied to the normalized tensor. Defaults
-   * to nullptr
-   * @param[in]      beta    (Optional) The offset tensor applied to the normalized tensor. Defaults
-   * to nullptr
-   * @param[in]      epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
-   */
-  void configure(ICLTensor *input, ICLTensor *output, ICLTensor *gamma = nullptr,
-                 ICLTensor *beta = nullptr, float epsilon = 1e-12f);
-
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * CLInstanceNormalizationLayerEx.
-   *
-   * @param[in] input   Source tensor info. Data types supported: F16/F32. Data layout supported:
-   * NHWC, NCHW
-   * @param[in] output  Destination tensor info. Data types and data layouts supported: same as @p
-   * input.
-   * @param[in] gamma   (Optional) The scale tensor applied to the normalized tensor. Defaults to
-   * nullptr
-   * @param[in] beta    (Optional) The offset tensor applied to the normalized tensor. Defaults to
-   * nullptr
-   * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *output,
-                         const ITensorInfo *gamma = nullptr, const ITensorInfo *beta = nullptr,
-                         float epsilon = 1e-12f);
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h
deleted file mode 100644
index 8ec9aa307..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLNEG_H__
-#define __ARM_COMPUTE_CLNEG_H__
-
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-class CLNeg : public ICLSimpleFunction
-{
-public:
-  /** Initialise the function's source and destination.
-   *
-   * @param[in]  input  Source tensor. Data types supported:
-   * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
-   * @param[out] output Output tensor. Data types supported: Same as @p input.
-   *
-   */
-  void configure(ICLTensor *input, ICLTensor *output);
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLNEG_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h
deleted file mode 100644
index 2bbfca821..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_CLONEHOT_H__
-#define __ARM_COMPUTE_CLONEHOT_H__
-#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
-#include "arm_compute/core/CL/kernels/CLOneHotKernel.h"
-#include "arm_compute/runtime/IFunction.h"
-namespace arm_compute
-{
-class ICLTensor;
-/** Basic function to run @ref CLOneHotKernel */
-class CLOneHot : public IFunction
-{
-public:
-  /** Constructor */
-  CLOneHot();
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  CLOneHot(const CLOneHot &) = delete;
-  /** Default move constructor */
-  CLOneHot(CLOneHot &&) = default;
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  CLOneHot &operator=(const CLOneHot &) = delete;
-  /** Default move assignment operator */
-  CLOneHot &operator=(CLOneHot &&) = default;
-  /** Initialise the kernel's inputs and outputs
-   *
-   * @param[in]  indices   Indices tensor. Supported tensor rank: up to 3. Must be one of the
-   * following types: U32/S32
-   * @param[in]  on_value  On value tensor. Supported tensor rank: only 1. Data type supported:
-   * U8/S8/U16/S16/F16/U32/S32/F32
-   * @param[in]  off_value Off value tensor. Supported tensor rank: only 1. Data type supported:
-   * Same as @p on_value
-   * @param[out] output    Destination tensor. Data type supported: Same as @p on_value
-   * @param[in]  depth     The depth of the one hot dimension.
-   * @param[in]  axis      (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
-   * value must be in range [-indices.rank , indices.rank)
-   */
-  void configure(const ICLTensor *indices, const ICLTensor *on_value, const ICLTensor *off_value,
-                 ICLTensor *output, int depth, int axis = -1);
-  /** Initialise the kernel's inputs and outputs with off_value being constant
-   *
-   * @param[in]  indices   Indices tensor. Supported tensor rank: up to 3. Must be one of the
-   * following types: U32/S32
-   * @param[in]  on_value  On value tensor. Supported tensor rank: only 1. Data type supported:
-   * U8/S8/U16/S16/F16/U32/S32/F32
-   * @param[out] output    Destination tensor. Data type supported: Same as @p on_value
-   * @param[in]  off_value The PixelValue for off value. Data type supported: Same as @p on_value
-   * @param[in]  depth     The depth of the one hot dimension.
-   * @param[in]  axis      (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
-   * value must be in range [-indices.rank , indices.rank)
-   */
-  void configure(const ICLTensor *indices, const ICLTensor *on_value, ICLTensor *output,
-                 PixelValue off_value, int depth, int axis = -1);
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * CLOneHotKernel
-   *
-   * @param[in]  indices   Indices tensor. Supported tensor rank: up to 3. Must be one of the
-   * following types: U32/S32
-   * @param[in]  on_value  On value tensor. Supported tensor rank: only 1. Data type supported:
-   * U8/S8/U16/S16/F16/U32/S32/F32
-   * @param[in]  off_value Off value tensor. Supported tensor rank: only 1. Data type supported:
-   * Same as @p on_value
-   * @param[in]  output    Destination tensor. Data type supported: Same as @p on_value
-   * @param[in]  depth     The depth of the one hot dimension.
-   * @param[in]  axis      (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
-   * value must be in range [-indices.rank , indices.rank)
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *indices, const ITensorInfo *on_value,
-                         const ITensorInfo *off_value, const ITensorInfo *output, int depth,
-                         int axis = -1);
-
-  // Inherited methods overridden:
-  void run() override;
-
-private:
-  CLMemsetKernel _memset_kernel; /**< Memset kernel */
-  CLOneHotKernel _onehot_kernel; /**< OneHot kernel */
-  bool _has_to_memset;
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_CLONEHOT_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h
deleted file mode 100644
index 7dba84b12..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/**
- * @file CLReduceOperation.h
- * @ingroup COM_AI_RUNTIME
- * @brief This file contains arm_compute::CLReduceOperation class
- */
-
-#ifndef __ARM_COMPUTE_CLREDUCEOPERATION_H__
-#define __ARM_COMPUTE_CLREDUCEOPERATION_H__
-
-#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
-#include "arm_compute/core/TypesEx.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/CLTensorAllocator.h"
-#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/**
- * @brief Class to perform ReduceOperation
- */
-class CLReduceOperation : public IFunction
-{
-public:
-  /**
-   * @brief Construct a new ReduceOperation object
-   */
-  CLReduceOperation(std::shared_ptr<IMemoryManager> memory_manager);
-
-  /**
-   * @brief Set the input and output tensors.
-   * @param[in]  input     Source tensor. Data types supported: U8/S32/F32
-   * @param[out] output    Destination tensor. Data types and data layouts supported: Same as @p
-   * input.
-   * @param[in]  axis      Axis along which to reduce. It must be sorted and contain no duplicates.
-   * @param[in]  keep_dims If true, retains reduced dimensions with length 1.
-   * @param[in]  op        Reduce operation to perform.
-   * @return N/A
-   */
-  void configure(ICLTensor *input, ICLTensor *output, const std::set<uint32_t> &axis,
-                 bool keep_dims, ReduceOperation op);
-
-  /**
-   * @brief Static function to check if given info will lead to a valid configuration of @ref
-   *        CLReduceOperation.
-   * @param[in] input     Source tensor info. Data types supported: U8/S32/F32
-   * @param[in] output    Destination tensor info. Data types and data layouts supported: Same as @p
-   * input.
-   * @param[in] axis      Axis along which to reduce. It must be sorted and contain no duplicates.
-   * @param[in] keep_dims If true, retains reduced dimensions with length 1.
-   * @param[in] op        Reduce operation to perform.
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *output,
-                         const std::set<uint32_t> &axis, bool keep_dims, const ReduceOperation &op);
-
-  /**
-   * @brief Run the OpenCL kernel for this operation
-   * @return N/A
-   */
-  void run() override;
-
-private:
-  MemoryGroup _memory_group;
-  ICLTensor *_input;
-  ICLTensor *_output;
-  std::set<uint32_t> _axis;
-  bool _keep_dims;
-
-  std::unique_ptr<CLTensor[]> _interm_tensors{nullptr};
-  std::unique_ptr<CLReduceOperationKernel[]> _reduce_kernels{nullptr};
-  CLReshapeLayer _reshape;
-};
-}
-#endif /*__ARM_COMPUTE_CLREDUCEOPERATION_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h
deleted file mode 100644
index bb741d98d..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_CLSPLITVEX__
-#define __ARM_COMPUTE_CLSPLITVEX__
-
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/CL/functions/CLSlice.h"
-#include "arm_compute/core/Types.h"
-#include <vector>
-#include <memory>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to run @ref CLSplitVKernel */
-class CLSplitVEx : public IFunction
-{
-public:
-  /** Default constructor */
-  CLSplitVEx();
-  /** Configure the split CL kernel
-   *
-   * @param[in]  input       The input tensor to split. Data types supported:
-   * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
-   * @param[in]  size_splits A 1-D tensor containing the number of tensor values per split
-   * @param[out] outputs     A vector containing the output tensors. Data types supported: Same as
-   *                         @p input.
-   *                         The output tensors should match the input tensor dimensions for all
-   *                         shape dimensions apart
-   *                         from the split dimension.
-   * @param[in]  split_dim   Integer value representing the input tensor dimension along which to
-   * split
-   * @param[in]  num_splits  Number of splits
-   */
-  void configure(const ICLTensor *input, const ICLTensor *size_splits, uint32_t split_dim,
-                 const std::vector<ICLTensor *> &outputs, unsigned int num_splits);
-
-  void run() override;
-
-private:
-  const ICLTensor *_input;
-  const ICLTensor *_size_splits;
-  std::vector<ICLTensor *> _outputs;
-  unsigned int _num_splits;
-  std::vector<CLSlice> _slice_functions;
-};
-}
-#endif /* __ARM_COMPUTE_CLSPLITVEX__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h
deleted file mode 100644
index e301a5152..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/**
- * @file CLTopKV2.h
- * @ingroup COM_AI_RUNTIME
- * @brief This file contains arm_compute::CLTopKV2 class
- */
-#ifndef __ARM_COMPUTE_CLTOPK_V2_H__
-#define __ARM_COMPUTE_CLTOPK_V2_H__
-
-#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h"
-
-#include "arm_compute/runtime/IFunction.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/**
- * @brief Class to execute TopKV2 operation.
- */
-class CLTopKV2 : public IFunction
-{
-public:
-  /**
-   * @brief Construct a new CLTopKV2 object
-   */
-  CLTopKV2();
-
-  /**
-   * @brief Prevent instances of this class from being copied (As this class contains pointers)
-   */
-  CLTopKV2(const CLTopKV2 &) = delete;
-
-  /**
-   * @brief Prevent instances of this class from being copied (As this class contains pointers)
-   */
-  CLTopKV2 &operator=(const CLTopKV2 &) = delete;
-
-  /**
-   * @brief Construct a new CLTopKV2 object by using move constructor
-   * @param[in] CLTopKV2 object to move
-   */
-  CLTopKV2(CLTopKV2 &&) = default;
-
-  /**
-   * @brief Assign a CLTopKV2 object.
-   * @param[in] CLTopKV2 object to assign. This object will be moved.
-   */
-  CLTopKV2 &operator=(CLTopKV2 &&) = default;
-
-  /**
-   * @brief Initialise the kernel's inputs and outputs.
-   * @param[in]  input     Input image. Data types supported: U8/S16/F32.
-   * @param[in]  k         The value of `k`.
-   * @param[out] values    Top k values. Data types supported: S32 if input type is U8/S16, F32 if
-   * input type is F32.
-   * @param[out] indices   Indices related to top k values. Data types supported: S32 if input type
-   * is U8/S16, F32 if input type is F32.
-   * @return N/A
-   */
-  void configure(ICLTensor *input, int k, ICLTensor *values, ICLTensor *indices,
-                 int total_bits = 32, int bits = 4);
-
-  /**
-   * @brief Run the kernels contained in the function
-   * Depending on the value of the following environment variables it works differently:
-   *   - If the value of environment variable "ACL_TOPKV2" == "GPU_SINGLE",
-   *     quick sort on GPU is used.
-   *   - If the value of environment variable "ACL_TOPKV2" == "GPU",
-   *     radix sort on GPU is used.
-   *   - For other value, TopKV2 runs on CPU
-   * @return N/A
-   */
-  void run() override;
-
-private:
-  void run_on_cpu();
-  void run_on_gpu();
-  void run_on_gpu_single_quicksort();
-
-  uint32_t _k;
-  uint32_t _total_bits;
-  uint32_t _bits;
-  uint32_t _radix;
-  uint32_t _hist_buf_size;
-  uint32_t _glob_sum_buf_size;
-  uint32_t _n;
-
-  ICLTensor *_input;
-  ICLTensor *_values;
-  ICLTensor *_indices;
-
-  cl::Buffer _qs_idx_buf;
-  cl::Buffer _qs_temp_buf;
-  cl::Buffer _hist_buf;
-  cl::Buffer _glob_sum_buf;
-  cl::Buffer _temp_buf;
-  cl::Buffer _first_negative_idx_buf;
-  cl::Buffer _in_key_buf;
-  cl::Buffer _out_key_buf;
-  cl::Buffer _in_ind_buf;
-  cl::Buffer _out_ind_buf;
-
-  cl::Buffer *_p_in_key_buf;
-  cl::Buffer *_p_out_key_buf;
-  cl::Buffer *_p_in_ind_buf;
-  cl::Buffer *_p_out_ind_buf;
-// The GPU implementation is disabled because it produces invalid results.
-// TODO Enable the GPU implementation once it has been verified, or remove
-//      this code.
-#if 0
-  CLTopKV2Single _qs_kernel;
-  CLTopKV2Init _init_kernel;
-  CLRadixSortHistogram _hist_kernel;
-  CLRadixSortScanHistogram _scan_hist_kernel;
-  CLRadixSortGlobalScanHistogram _glob_scan_hist_kernel;
-  CLRadixSortPasteHistogram _paste_hist_kernel;
-  CLRadixSortReorder _reorder_kernel;
-  CLTopKV2FindFirstNegative _find_first_negative_kernel;
-  CLTopKV2ReorderNegatives _reorder_negatives_kernel;
-  CLTopKV2Store _store_kernel;
-#endif
-};
-}
-#endif // __ARM_COMPUTE_CLTOPK_V2_H__
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h
deleted file mode 100644
index 5fb102e47..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__
-#define __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__
-
-#include "arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h"
-#include "arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-/** Basic function to compute the deconvolution layer. This function calls the following OpenCL
- * kernels/functions:
- *
- * -# @ref CLGEMMDeconvolutionLayer
- * -# @ref CLDirectTransposeConvLayer
- */
-class CLTransposeConvLayer : public IFunction
-{
-public:
-  /** Default constructor */
-  CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-
-  /** Set the input, weights, biases and output tensors.
-   *
-   * @param[in,out] input        Input tensor. 3 lower dimensions represent a single input, and an
- * optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
-   * @param[in]     weights      The 4d weights with dimensions [width, height, IFM, OFM]. Data type
- * supported: Same as @p input.
-   * @param[in]     bias         (Optional) The biases have one dimension. Data type supported: Same
- * as @p input.
-   * @param[out]    output       Output tensor. The output has the same number of dimensions as the
- * @p input.
-   * @param[in]     deconv_info  Contains padding and policies to be used in the deconvolution, this
- * is described in @ref PadStrideInfo.
-   * @param[in]     invalid_right  The number of zeros added to the right edge of the output.
-   * @param[in]     invalid_bottom The number of zeros added to the bottom edge of the output.
-   * @param[in]     weights_info (Optional) Weights information needed for @ref CLConvolutionLayer,
- * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
-   *
-   */
-  void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
-                 const PadStrideInfo &deconv_info, unsigned int invalid_right,
-                 unsigned int invalid_bottom, const WeightsInfo &weights_info = WeightsInfo());
-  /** Set the input, weights, biases and output tensors.
-   *
-   * @param[in]     compile_context The compile context to be used.
-   * @param[in,out] input           Input tensor. 3 lower dimensions represent a single input, and
- * an optional 4th dimension for batch of inputs. Data types supported:
- * QASYMM8_SIGNED/QASYMM8/F16/F32.
-   * @param[in]     weights         The 4d weights with dimensions [width, height, IFM, OFM]. Data
- * type supported: Same as @p input.
-   * @param[in]     bias            (Optional) The biases have one dimension. Data type supported:
- * Same as @p input.
-   * @param[out]    output          Output tensor. The output has the same number of dimensions as
- * the @p input.
-   * @param[in]     deconv_info     Contains padding and policies to be used in the deconvolution,
- * this is described in @ref PadStrideInfo.
-   * @param[in]     invalid_right   The number of zeros added to the right edge of the output.
-   * @param[in]     invalid_bottom  The number of zeros added to the bottom edge of the output.
-   * @param[in]     weights_info    (Optional) Weights information needed for @ref
- * CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref
- * CLWeightsReshapeKernel.
-   *
-   */
-  void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights,
-                 const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info,
-                 unsigned int invalid_right, unsigned int invalid_bottom,
-                 const WeightsInfo &weights_info = WeightsInfo());
-  /** Static function to check if given info will lead to a valid configuration of @ref
- * CLTransposeConvLayer
-   *
-   * @param[in] input        Input tensor info. 3 lower dimensions represent a single input, and an
- * optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
-   * @param[in] weights      The 4d weights info with dimensions [width, height, IFM, OFM]. Data
- * type supported: Same as @p input.
-   * @param[in] bias         (Optional) The biases have one dimension. Data type supported: Same as
- * @p input.
-   * @param[in] output       Output tensor info. The output has the same number of dimensions as the
- * @p input.
-   * @param[in] deconv_info  Contains padding and policies to be used in the deconvolution, this is
- * described in @ref PadStrideInfo.
-   * @param[in] invalid_right  The number of zeros added to the right edge of the output.
-   * @param[in] invalid_bottom The number of zeros added to the bottom edge of the output.
-   * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer,
- * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
-                         const ITensorInfo *bias, ITensorInfo *output,
-                         const PadStrideInfo &deconv_info, unsigned int invalid_right,
-                         unsigned int invalid_bottom,
-                         const WeightsInfo &weights_info = WeightsInfo());
-
-  static DeconvolutionMethod
-  get_deconvolution_method(const ITensorInfo *input, const ITensorInfo *weights,
-                           const ITensorInfo *bias, ITensorInfo *output,
-                           const PadStrideInfo &deconv_info, unsigned int invalid_right,
-                           unsigned int invalid_bottom, const WeightsInfo &weights_info);
-  // Inherited methods overridden:
-  void run() override;
-  void prepare() override;
-
-private:
-  std::shared_ptr<IMemoryManager> _memory_manager;
-  std::unique_ptr<IFunction> _function;
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h
deleted file mode 100644
index d47b1fe62..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef __ARM_COMPUTE_NEFUNCTIONSEX_H__
-#define __ARM_COMPUTE_NEFUNCTIONSEX_H__
-
-#include <arm_compute/runtime/NEON/functions/NEActivationLayerEx.h>
-#include <arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h>
-#include <arm_compute/runtime/NEON/functions/NECastBool.h>
-#include <arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h>
-#include <arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h>
-#include <arm_compute/runtime/NEON/functions/NEGatherEx.h>
-#include <arm_compute/runtime/NEON/functions/NEHashtableLookup.h>
-#include <arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h>
-#include <arm_compute/runtime/NEON/functions/NEOneHot.h>
-#include <arm_compute/runtime/NEON/functions/NEReduceSum.h>
-#include <arm_compute/runtime/NEON/functions/NEReduceOperation.h>
-#include <arm_compute/runtime/NEON/functions/NETransposeConvLayer.h>
-
-#endif // __ARM_COMPUTE_NEFUNCTIONSEX_H__
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEActivationLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEActivationLayerEx.h
deleted file mode 100644
index 6156c84f8..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEActivationLayerEx.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEACTIVATIONLAYEREX_H__
-#define __ARM_COMPUTE_NEACTIVATIONLAYEREX_H__
-
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Basic function to run @ref NEActivationLayerKernelEx
- *
- * @note The function simulates an activation layer with the specified activation function.
- */
-class NEActivationLayerEx : public INESimpleFunctionNoBorder
-{
-public:
-  /** Constructor
-   *
-   * @param[in] ctx Runtime context to be used by the function
-   */
-  NEActivationLayerEx(IRuntimeContext *ctx = nullptr);
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  NEActivationLayerEx(const NEActivationLayerEx &) = delete;
-  /** Default move constructor */
-  NEActivationLayerEx(NEActivationLayerEx &&) = default;
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  NEActivationLayerEx &operator=(const NEActivationLayerEx &) = delete;
-  /** Default move assignment operator */
-  NEActivationLayerEx &operator=(NEActivationLayerEx &&) = default;
-  /** [NEActivationLayerEx snippet] **/
-  /** Set the input and output tensor.
-   *
-   * @note If the output tensor is a nullptr or is equal to the input, the activation function will
-   * be performed in-place
-   *
-   * @param[in, out] input           Source tensor. In case of @p output tensor = nullptr, this
-   * tensor will store the result
-   *                                 of the activation function. Data types supported:
-   * QASYMM8/QSYMM16/F16/F32.
-   * @param[out]     output          Destination tensor. Data type supported: same as @p input
-   * @param[in]      activation_info Activation layer parameters.
-   */
-  void configure(ITensor *input, ITensor *output, ActivationLayerInfo activation_info);
-  /** [NEActivationLayerEx snippet] **/
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * NEActivationLayerEx
-   *
-   * @param[in] input    Source tensor info. In case of @p output tensor info = nullptr, this tensor
-   * will store the result
-   *                     of the activation function. Data types supported: QASYMM8/QSYMM16/F16/F32.
-   * @param[in] output   Destination tensor info. Data type supported: same as @p input
-   * @param[in] act_info Activation layer information.
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *output,
-                         const ActivationLayerInfo &act_info);
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NEACTIVATIONLAYEREX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h
deleted file mode 100644
index 026d30098..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__
-#define __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__
-
-#include "arm_compute/core/TypesEx.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to run @ref NEBinaryLogicalOperationKernel.
- *
- * @note The tensor data type for the inputs must be QASYMM8/U8.
- * @note The function performs a binary logical operation between two tensors.
- */
-class NEBinaryLogicalOperation : public INESimpleFunction
-{
-public:
-  /** Initialise the kernel's inputs, output and the logical operation to perform.
-   *
-   * @param[in, out] input1 First tensor input. Data types supported: QASYMM8/U8.
-   * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1.
-   * @param[out]     output Output tensor. Data types supported: Same as @p input1.
-   * @param[in]      op     Binary Logical Operation to be performed.
-   */
-  void configure(ITensor *input1, ITensor *input2, ITensor *output, BinaryLogicalOperation op);
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * NEBinaryLogicalOperationKernel
-   *
-   * @param[in] input1 First tensor input info. Data types supported: QASYMM8/U8.
-   * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
-   * @param[in] output Output tensor info. Data types supported: Same as @p input1.
-   * @param[in] op     Binary Logical Operation to be performed.
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
-                         const ITensorInfo *output, BinaryLogicalOperation op);
-};
-
-/** Basic function to run @ref NEBinaryLogicalOperationKernel
- *
- * @note The tensor data type for the inputs must be QASYMM8/U8.
- * @note The function performs a binary logical operation between two tensors.
- */
-template <BinaryLogicalOperation op> class NEBinaryLogicalOperationStatic : public INESimpleFunction
-{
-public:
-  /** Initialise the kernel's inputs and output.
-   *
-   * @param[in, out] input1 First tensor input. Data types supported: QASYMM8/U8
-   * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1.
-   * @param[out]     output Output tensor. Data types supported: Same as @p input1.
-   */
-  void configure(ITensor *input1, ITensor *input2, ITensor *output);
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * NEBinaryLogicalOperationKernel
-   *
-   * @param[in] input1 First tensor input info. Data types supported: QASYMM8/U8
-   * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
-   * @param[in] output Output tensor info. Data types supported: Same as @p input1.
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
-                         const ITensorInfo *output);
-};
-
-/** Basic function to run logical AND. */
-using NELogicalAnd = NEBinaryLogicalOperationStatic<BinaryLogicalOperation::AND>;
-/** Basic function to run logical OR. */
-using NELogicalOr = NEBinaryLogicalOperationStatic<BinaryLogicalOperation::OR>;
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h
deleted file mode 100644
index c8b08af8d..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_NECASTBOOL_H__
-#define __ARM_COMPUTE_NECASTBOOL_H__
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/**
- * @brief Class to run @ref NECastBoolKernel.
- */
-class NECastBool : public INESimpleFunction
-{
-public:
-  /** Initialize the function's source, destination
-   *
-   * Valid conversions Input -> Output :
-   *
-   *   - U8 -> U8, S8, U16, S16, U32, S32, F32, F16
-   *
-   * @param[in]  input  The input tensor to convert. Data types supported: U8
-   * @param[out] output The output tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
-   */
-  void configure(const ITensor *input, ITensor *output);
-  /** Static function to check if given info will lead to a valid configuration of @ref NECastBool
-   *
-   * @param[in] input  Source tensor info. Data types supported: U8.
-   * @param[in] output Destination tensor info. Data type supported: U8/S8/U16/S16/U32/S32/F16/F32.
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_NECASTBOOL_H__*/
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h
deleted file mode 100644
index 63f7714aa..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/**
- * @file NEEmbeddingLookup.h
- * @ingroup COM_AI_RUNTIME
- * @brief This file contains arm_compute::NEEmbeddingLookup class
- */
-
-#ifndef __ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__
-#define __ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__
-
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
-#include <vector>
-
-namespace arm_compute
-{
-class ITensor;
-
-/**
- * @brief Class to perform EmbeddingLookup operation
- */
-class NEEmbeddingLookup : public INESimpleFunctionNoBorder
-{
-public:
-  /**
-   * @brief Set the input and output tensors.
-   * @param[in]  input    Source tensor.
-   *                      Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
-   * @param[out] output   Destination tensor. Data types and data layouts supported: Same as @p
-   * input.
-   * @param[in]  lookups  1D tensor whose values are indices into the first dimension of
-   * input. Data types supported: S32.
-   * @return N/A
-   */
-  void configure(const ITensor *input, ITensor *output, const ITensor *lookups);
-  /** Static function to check if given info leads to a valid @ref NEEmbeddingLookup configuration
-   *
-   * @param[in] input  Source tensor info. Data types supported:
-   * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
-   * @param[in] output Output tensor info. Data types supported: Same as @p input.
-   * @param[in] lookups Lookups tensor info. Data types supported: S32.
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *output,
-                         const ITensorInfo *lookups);
-};
-}
-#endif /*__ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h
deleted file mode 100644
index 56548a479..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEFULLYCONNECTEDHYBRIDLAYER_H__
-#define __ARM_COMPUTE_NEFULLYCONNECTEDHYBRIDLAYER_H__
-
-#include "arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
-#include "arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h"
-#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-#include "arm_compute/runtime/Tensor.h"
-
-namespace arm_compute
-{
-/** Basic function to reshape the weights of a Fully Connected layer with NEON. This function
- * calls the following kernels:
- *
- *  -# @ref NETransposeKernel
- *
- * @note  The fully connected layer only accepts "weights" tensors with 2 dimensions.
- */
-class NEFullyConnectedHybridLayerReshapeWeights : public INESimpleFunctionNoBorder
-{
-public:
-  /** Set the input and output tensors.
-   *
-   * @param[in]  input  Weights tensor. The weights must be 2 dimensional. Data types supported:
-   * QASYMM8/F16/F32.
-   * @param[out] output Destination tensor. Data type supported: Same as @p input.
-   */
-  void configure(const ITensor *input, ITensor *output);
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * NEFullyConnectedHybridLayerReshapeWeights
-   *
-   * @param[in] input  Weights tensor info. The weights must be 2 dimensional. Data types supported:
-   * QASYMM8/F16/F32.
-   * @param[in] output Destination tensor info. Data type supported: Same as @p input.
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-};
-
-/** Basic function to compute a hybrid Fully Connected layer on NEON (F16/F32 input, S8 weights).
- * The members of this class use the following functions and kernels:
- *  -# @ref NEFullyConnectedHybridLayerReshapeWeights
- *  -# @ref NEQuantizationSymmetricKernel
- *  -# @ref NEGEMMLowpMatrixMultiplyCore
- *  -# @ref NEMultiplyScaleFactorKernel
- *  -# @ref NEGEMMMatrixAccumulateBiasesKernel
- *
- * NOTE(review): the order in which these run, and the conditions under which each one is used,
- * are decided in the implementation file and are not visible from this header.
- *
- * @note  The fully connected layer only accepts "weights" tensors with 2 dimensions.
- */
-class NEFullyConnectedHybridLayer : public IFunction
-{
-public:
-  /** Constructor */
-  NEFullyConnectedHybridLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  NEFullyConnectedHybridLayer(const NEFullyConnectedHybridLayer &) = delete;
-  /** Default move constructor */
-  NEFullyConnectedHybridLayer(NEFullyConnectedHybridLayer &&) = default;
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  NEFullyConnectedHybridLayer &operator=(const NEFullyConnectedHybridLayer &) = delete;
-  /** Default move assignment operator */
-  NEFullyConnectedHybridLayer &operator=(NEFullyConnectedHybridLayer &&) = default;
-  /** Set the input and output tensors.
-   *
-   * @param[in]  input   Source tensor. Data type supported: F16/F32.
-   * @param[in]  weights Weights tensor. The weights must be 2 dimensional.
-   *                     If this function is called after a Convolution Layer, the (transposed)
-   * weights will have as many rows as the product of the first 3 input's dimensions.
-   *                     If it is called after another FullyConnected Layer, the (transposed)
-   * weights will have as many rows as the input's first dimension.
-   *                     Data type supported: S8.
-   * @param[in]  biases  Bias tensor. Can be nullptr. Data type supported: Same as @p input.
-   * @param[out] output  Destination tensor. Its shape should be equal to the output of a matrix
-   * multiplication between:
-   *                     - The output of im2col on the input and the (transposed) 2D weights, if the
-   * function is called after a Convolution Layer
-   *                     - The input tensor and the (transposed) 2D weights, if the function is
-   * called after another FullyConnected Layer.
-   *                     Data type supported: Same as @p input.
-   * @param[in]  fc_info (Optional) Fully connected layer additional info
-   */
-  void configure(const ITensor *input, const ITensor *weights, const ITensor *biases,
-                 ITensor *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * NEFullyConnectedHybridLayer
-   *
-   * @param[in]  input   Source tensor info. Data type supported: F16/F32.
-   * @param[in]  weights Weights tensor info. The weights must be 2 dimensional.
-   *                     If this function is called after a Convolution Layer, the (transposed)
-   * weights will have as many rows as the product of the first 3 input's dimensions.
-   *                     If it is called after another FullyConnected Layer, the (transposed)
-   * weights will have as many rows as the input's first dimension.
-   *                     Data type supported: S8.
-   * @param[in]  biases  Bias tensor info. Can be nullptr. Data type supported: Same as @p input.
-   * @param[in]  output  Destination tensor info. Its shape should be equal to the output of a
-   * matrix multiplication between:
-   *                     - The output of im2col on the input and the (transposed) 2D weights, if the
-   * function is called after a Convolution Layer
-   *                     - The input tensor and the (transposed) 2D weights, if the function is
-   * called after another FullyConnected Layer.
-   *                     Data type supported: Same as @p input.
-   * @param[in]  fc_info (Optional) Fully connected layer additional info
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
-                         const ITensorInfo *biases, const ITensorInfo *output,
-                         FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
-
-  // Overrides of the methods inherited from IFunction
-  void run() override;
-  void prepare() override;
-
-private:
-  void configure_mm(const ITensor *input, const ITensor *weights, ITensor *output);
-
-  MemoryGroup _memory_group;
-  NEFullyConnectedHybridLayerReshapeWeights _reshape_weights_function;
-  NEQuantizationSymmetricKernel _quant_input_kernel;
-  NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
-  NEMultiplyScaleFactorKernel _multiply_scale_kernel;
-  NEGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel;
-  Tensor _reshape_weights_output;
-  Tensor _quantized_input;
-  Tensor _scale_factor;
-  Tensor _gemmlowp_output;
-  const ITensor *_original_weights;
-  bool _are_weights_reshaped;
-  bool _accumulate_biases;
-  bool _is_prepared;
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NEFULLYCONNECTEDHYBRIDLAYER_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h
deleted file mode 100644
index 8f98f220a..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEFULLYCONNECTEDLAYEREX_H__
-#define __ARM_COMPUTE_NEFULLYCONNECTEDLAYEREX_H__
-
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
-#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
-#include "arm_compute/runtime/Tensor.h"
-
-namespace arm_compute
-{
-/** Basic function to compute a Fully Connected layer on NEON. This function calls the following
- * NEON kernels:
- *  -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer)
- *  -# @ref NEFullyConnectedLayerReshapeWeights (if @p are_weights_reshaped is set to false and
- * transpose_weights is set to true ) (called once)
- *  -# @ref NEGEMMMatrixMultiplyKernel or @ref NEGEMMLowpMatrixMultiplyCore (if quantized
- * asymmetric)
- *  -# @ref NEGEMMMatrixAccumulateBiasesKernel or @ref
- * NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric) (if @p biases is
- * not equal to nullptr)
- *
- * @note  The fully connected layer only accepts "weights" tensors with 2 dimensions.
- * @note  Unlike NEFullyConnectedLayer, this class supports weights as an input, at the cost of
- * some performance.
- */
-class NEFullyConnectedLayerEx : public IFunction
-{
-public:
-  /** Constructor */
-  NEFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  NEFullyConnectedLayerEx(const NEFullyConnectedLayerEx &) = delete;
-  /** Default move constructor */
-  NEFullyConnectedLayerEx(NEFullyConnectedLayerEx &&) = default;
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  NEFullyConnectedLayerEx &operator=(const NEFullyConnectedLayerEx &) = delete;
-  /** Default move assignment operator */
-  NEFullyConnectedLayerEx &operator=(NEFullyConnectedLayerEx &&) = default;
-  /** Set the input and output tensors.
-   *
-   * @param[in]  input   Source tensor. Data type supported: QASYMM8/F16/F32.
-   * @param[in]  weights Weights tensor. The weights must be 2 dimensional.
-   *                     If this function is called after a Convolution Layer, the (transposed)
-   * weights will have as many rows as the product of the first 3 input's dimensions.
-   *                     If it is called after another FullyConnected Layer, the (transposed)
-   * weights will have as many rows as the input's first dimension.
-   *                     Data type supported: Same as @p input.
-   * @param[in]  biases  Bias tensor. Can be nullptr. Data type supported: Same as @p input.
-   * @param[out] output  Destination tensor. Its shape should be equal to the output of a matrix
-   * multiplication between:
-   *                     - The output of im2col on the input and the (transposed) 2D weights, if the
-   * function is called after a Convolution Layer
-   *                     - The input tensor and the (transposed) 2D weights, if the function is
-   * called after another FullyConnected Layer.
-   *                     Data type supported: Same as @p input.
-   * @param[in]  fc_info (Optional) Fully connected layer additional info
-   */
-  void configure(const ITensor *input, const ITensor *weights, const ITensor *biases,
-                 ITensor *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * NEFullyConnectedLayerEx
-   *
-   * @param[in]  input   Source tensor info. Data type supported: QASYMM8/F16/F32.
-   * @param[in]  weights Weights tensor info. The weights must be 2 dimensional.
-   *                     If this function is called after a Convolution Layer, the (transposed)
-   * weights will have as many rows as the product of the first 3 input's dimensions.
-   *                     If it is called after another FullyConnected Layer, the (transposed)
-   * weights will have as many rows as the input's first dimension.
-   *                     Data type supported: Same as @p input.
-   * @param[in]  biases  Bias tensor info. Can be nullptr. Data type supported: Same as @p input.
-   * @param[in]  output  Destination tensor info. Its shape should be equal to the output of a
-   * matrix multiplication between:
-   *                     - The output of im2col on the input and the (transposed) 2D weights, if the
-   * function is called after a Convolution Layer
-   *                     - The input tensor and the (transposed) 2D weights, if the function is
-   * called after another FullyConnected Layer.
-   *                     Data type supported: Same as @p input.
-   * @param[in]  fc_info (Optional) Fully connected layer additional info
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
-                         const ITensorInfo *biases, const ITensorInfo *output,
-                         FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
-
-  // Overrides of the methods inherited from IFunction
-  void run() override;
-  void prepare() override;
-
-private:
-  void configure_fc_fc(const ITensor *input, const ITensor *weights, ITensor *output);
-  void configure_conv_fc(const ITensor *input, const ITensor *weights, ITensor *output);
-  void configure_mm(const ITensor *input, const ITensor *weights, ITensor *output);
-
-  MemoryGroup _memory_group;
-  NEFlattenLayerKernel _flatten_kernel;
-  NEConvertFullyConnectedWeights _convert_weights;
-  NEFullyConnectedLayerReshapeWeights _reshape_weights_function;
-  NEGEMM _mm_gemm;
-  NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
-  NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint _gemmlowp_output_stage;
-  NEGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel;
-  Tensor _flatten_output;
-  Tensor _gemmlowp_output;
-  Tensor _converted_weights_output;
-  Tensor _reshape_weights_output;
-  const ITensor *_original_weights;
-  bool _are_weights_converted;
-  bool _are_weights_reshaped;
-  bool _is_fc_after_conv;
-  bool _accumulate_biases;
-  bool _is_quantized;
-  bool _is_prepared;
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NEFULLYCONNECTEDLAYEREX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h
deleted file mode 100644
index 18cb61bf9..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * @file        NEFullyConnectedReshapingLayer.h
- * @brief       This file contains NEFullyConnectedReshapingLayer class
- * @ingroup     COM_AI_RUNTIME
- */
-
-#ifndef __ARM_COMPUTE_NE_FULLY_CONNECTED_RESHAPING_LAYER_H__
-#define __ARM_COMPUTE_NE_FULLY_CONNECTED_RESHAPING_LAYER_H__
-
-#include <arm_compute/runtime/NEON/functions/NEReshapeLayer.h>
-#include <arm_compute/runtime/IMemoryManager.h>
-#include <arm_compute/runtime/Tensor.h>
-
-namespace arm_compute
-{
-/**
- * @brief Class to run a FullyConnected layer after reshaping the input tensor
- */
-class NEFullyConnectedReshapingLayer : public arm_compute::IFunction
-{
-public:
-  enum class KernelType
-  {
-    GENERAL,             ///< General FC
-    PREPROCESSED_WEIGHTS ///< Weights are constants so they can be preprocessed
-  };
-
-public:
-  NEFullyConnectedReshapingLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr)
-      : _memory_manager{memory_manager}, _input(nullptr), _weights(nullptr), _biases(nullptr),
-        _output(nullptr), _neon_buffer{}, _neon_fc{nullptr}, _neon_reshape{}, _needs_reshape(false)
-  {
-    // Nothing to do: every member is set in the initializer list above
-  }
-
-public:
-  /**
-   * @brief Configure the layer
-   * @param[in]  input The source tensor
-   * @param[in]  weights The tensor that is filled with weight values
-   * @param[in]  biases The tensor that is filled with bias values
-   * @param[out] output The destination tensor
-   * @param[in]  needs_reshape Whether the input needs to be reshaped or not
-   * @param[in]  reshape The shape to reshape the input to. Only valid when needs_reshape is true.
-   * @param[in]  kernel_type The kernel type for actual FullyConnected layer
-   * @return N/A
-   */
-  void configure(const arm_compute::ITensor *input, const arm_compute::ITensor *weights,
-                 const arm_compute::ITensor *biases, arm_compute::ITensor *output,
-                 bool needs_reshape, const arm_compute::TensorShape &reshape,
-                 KernelType kernel_type);
-
-public:
-  /**
-   * @brief Run the operation. Must be called after configure().
-   * @return N/A
-   */
-  void run(void) override;
-  /**
-   * @brief Prepare the operation
-   * @return N/A
-   */
-  void prepare(void) override;
-
-private:
-  std::shared_ptr<IMemoryManager> _memory_manager;
-  const arm_compute::ITensor *_input;
-  const arm_compute::ITensor *_weights;
-  const arm_compute::ITensor *_biases;
-  arm_compute::ITensor *_output;
-
-  // buffer for reshaping input tensor
-  arm_compute::Tensor _neon_buffer;
-
-private:
-  std::unique_ptr<arm_compute::IFunction> _neon_fc;
-  NEReshapeLayer _neon_reshape;
-  bool _needs_reshape;
-};
-} // namespace arm_compute
-
-#endif // __ARM_COMPUTE_NE_FULLY_CONNECTED_RESHAPING_LAYER_H__
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h
deleted file mode 100644
index 155a1b837..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEGATHEREX_H__
-#define __ARM_COMPUTE_NEGATHEREX_H__
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to run @ref NEGatherKernelEx */
-class NEGatherEx : public INESimpleFunctionNoBorder
-{
-public:
-  /** Initialise the kernel's inputs and outputs
-   *
-   * @param[in]  input   Source tensor. Supported tensor rank: up to 4. Data type supported:
-   * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
-   * @param[in]  indices Indices tensor. Supported tensor rank: up to 3. Must be one of the
-   * following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
-   * @param[out] output  Destination tensor. Data type supported: Same as @p input
-   * @param[in]  axis    (Optional) The axis in @p input to gather @p indices from. Defaults to 0
-   */
-  void configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis = 0);
-
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * NEGatherKernelEx
-   *
-   * @param[in] input   Source tensor info. Supported tensor rank: up to 4. Data type supported:
-   * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
-   * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. Must be one of the
-   * following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
-   * @param[in] output  Destination tensor info. Data type supported: Same as @p input
-   * @param[in] axis    The axis in @p input to gather @p indices from. Required: no default value
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *indices,
-                         const ITensorInfo *output, int axis);
-};
-
-} // namespace arm_compute
-
-#endif /* __ARM_COMPUTE_NEGATHEREX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h
deleted file mode 100644
index 521a05ad9..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/**
- * @file NEHashtableLookup.h
- * @ingroup COM_AI_RUNTIME
- * @brief This file contains arm_compute::NEHashtableLookup class
- */
-
-#ifndef __ARM_COMPUTE_NEHASHTABLELOOKUP_H__
-#define __ARM_COMPUTE_NEHASHTABLELOOKUP_H__
-
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
-#include <vector>
-
-namespace arm_compute
-{
-class ITensor;
-
-/**
- * @brief Class to perform HashtableLookup operation
- */
-class NEHashtableLookup : public INESimpleFunctionNoBorder
-{
-public:
-  /**
-   * @brief Set the input and output tensors.
-   * @param[in]  lookups  Lookups 1D tensor whose values are indices into the first dimension of
-   *                      input. Data types supported: S32
-   * @param[in]  keys     Keys 1D tensor. keys and input pair represent a map.
-   *                      Data types supported: S32
-   * @param[in]  input    Source tensor.
-   *                      Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
-   * @param[out] output   Destination tensor. Data types and data layouts supported: Same as @p
-   *                      input.
-   * @param[out] hits     Hits 1D tensor. A boolean tensor that indicates whether the lookup hits
-   *                      (True) or not (False). Data types supported: U8/QASYMM8
-   * @return N/A
-   */
-  void configure(const ITensor *lookups, const ITensor *keys, const ITensor *input, ITensor *output,
-                 ITensor *hits);
-  /** Static function to check if given info leads to a valid @ref NEHashtableLookup configuration
-   *
-   * @param[in]  lookups  Lookups 1D tensor info.
-   *                      Data types supported: S32
-   * @param[in]  keys     Keys 1D tensor info. keys and input pair represent a map.
-   *                      Data types supported: S32
-   * @param[in]  input    Source tensor info.
-   *                      Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
-   * @param[in]  output   Destination tensor info. Data types and data layouts supported: Same as @p
-   * input.
-   * @param[in]  hits     Hits 1D tensor info. A boolean tensor that indicates whether the lookup
-   * hits (True) or not (False). Data types supported: U8/QASYMM8
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *lookups, const ITensorInfo *keys,
-                         const ITensorInfo *input, const ITensorInfo *output,
-                         const ITensorInfo *hits);
-};
-}
-#endif /*__ARM_COMPUTE_NEHASHTABLELOOKUP_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h
deleted file mode 100644
index 18e813923..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYEREX_H__
-#define __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYEREX_H__
-
-#include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEPermute.h"
-#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
-#include "arm_compute/runtime/Tensor.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to perform an instance normalization.
- *
- * This function runs the following kernels:
- * -# @ref NEInstanceNormalizationLayerKernelEx
- */
-class NEInstanceNormalizationLayerEx : public IFunction
-{
-public:
-  /** Constructor. @p memory_manager is optional and defaults to nullptr. */
-  NEInstanceNormalizationLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-  /** Set the input and output tensors.
-   *
-   * @param[in, out] input   Source tensor. In case of @p output tensor = nullptr this tensor will
-   *                         store the result of the normalization.
-   *                         Data types supported: F16/F32. Data layout supported: NHWC, NCHW
-   * @param[out]     output  Destination tensor. Data types and data layouts supported: same as
-   *                         @p input.
-   * @param[in]      gamma   (Optional) Tensor holding the scale applied to the normalized tensor.
-   *                         Presumably nullptr selects a scale of 1.0 (TODO confirm).
-   * @param[in]      beta    (Optional) Tensor holding the offset applied to the normalized tensor.
-   *                         Presumably nullptr selects an offset of 0.0 (TODO confirm).
-   * @param[in]      epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
-   */
-  void configure(ITensor *input, ITensor *output, ITensor *gamma, ITensor *beta,
-                 float epsilon = 1e-12f);
-
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * NEInstanceNormalizationLayerEx.
-   *
-   * @param[in] input   Source tensor info. Data types supported: F16/F32. Data layout supported:
-   *                    NHWC, NCHW
-   * @param[in] output  Destination tensor info. Data types and data layouts supported: same as
-   *                    @p input.
-   * @param[in] gamma   (Optional) Info of the scale tensor applied to the normalized tensor.
-   *                    Defaults to nullptr (presumably meaning a scale of 1.0 - TODO confirm)
-   * @param[in] beta    (Optional) Info of the offset tensor applied to the normalized tensor.
-   *                    Defaults to nullptr (presumably meaning an offset of 0.0 - TODO confirm)
-   * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *output,
-                         const ITensorInfo *gamma = nullptr, const ITensorInfo *beta = nullptr,
-                         float epsilon = 1e-12f);
-
-  // Inherited methods overridden:
-  void run() override;
-
-private:
-  MemoryGroup _memory_group;
-  NEInstanceNormalizationLayerKernelEx _normalization_kernel;
-  bool _is_nchw;
-  NEPermute _permute_input;
-  NEPermute _permute_output;
-  Tensor _permuted_input;
-  Tensor _permuted_output;
-};
-}
-#endif /* __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYEREX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h
deleted file mode 100644
index b2ea6270f..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_NEONEHOT_H__
-#define __ARM_COMPUTE_NEONEHOT_H__
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-/** Basic function to run @ref NEOneHotKernel */
-class NEOneHot : public INESimpleFunctionNoBorder
-{
-public:
-  /** Initialise the kernel's inputs and outputs
-   *
-   * @param[in]  indices   Indices tensor. Supported tensor rank: up to 3. Must be one of the
-   *                       following types: U32/S32
-   * @param[in]  depth     The tensor for depth of the one hot dimension. Supported tensor rank: up
-   *                       to 3. Must be one of the following types: U32/S32
-   * @param[in]  on_value  On value tensor. Supported tensor rank: only 1. Data type supported:
-   *                       U8/S8/U16/S16/F16/U32/S32/F32
-   * @param[in]  off_value Off value tensor. Supported tensor rank: only 1. Data type supported:
-   *                       Same as @p on_value
-   * @param[out] output    Destination tensor. Data type supported: Same as @p on_value
-   * @param[in]  axis      (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
-   *                       The value must be in range [-indices.rank , indices.rank)
-   */
-  void configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value,
-                 const ITensor *off_value, ITensor *output, int axis = -1);
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * NEOneHotKernel
-   *
-   * @param[in] indices   Indices tensor info. Supported tensor rank: up to 3. Must be one of the
-   *                      following types: U32/S32
-   * @param[in] depth     The tensor info for depth of the one hot dimension. Supported tensor
-   *                      rank: up to 3. Must be one of the following types: U32/S32
-   * @param[in] on_value  On value tensor info. Supported tensor rank: only 1. Data type
-   *                      supported: U8/S8/U16/S16/F16/U32/S32/F32
-   * @param[in] off_value Off value tensor info. Supported tensor rank: only 1. Data type
-   *                      supported: Same as @p on_value
-   * @param[in] output    Destination tensor info. Data type supported: Same as @p on_value
-   * @param[in] axis      (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
-   *                      The value must be in range [-indices.rank , indices.rank)
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *indices, const ITensorInfo *depth,
-                         const ITensorInfo *on_value, const ITensorInfo *off_value,
-                         const ITensorInfo *output, int axis = -1);
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NEONEHOT_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h
deleted file mode 100644
index 7f764b000..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEON_REDUCE_OPERATION_H__
-#define __ARM_COMPUTE_NEON_REDUCE_OPERATION_H__
-
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/TypesEx.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEReductionOperationEx.h"
-#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
-#include "arm_compute/runtime/Tensor.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to perform reduce operation */
-class NEReduceOperation : public IFunction
-{
-public:
-  /** Constructor */
-  NEReduceOperation(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-  /** Configure kernel
-   *
-   * @note Supported tensor rank: up to 4
-   *
-   * @param[in]  input          Source tensor. Data type supported: QASYMM8/F16/F32
-   * @param[in]  reduction_axis Reduction axis vector.
-   * @param[in]  keep_dims      If true, retains reduced dimensions with length 1.
-   * @param[out] output         Destination tensor. Data type supported: Same as @p input
-   * @param[in]  op             Reduce operation to perform.
-   */
-  void configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, ITensor *output,
-                 ReduceOperation op);
-
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * NEReduceOperation
-   *
-   * @param[in] input          Source tensor info. Data type supported: QASYMM8/F16/F32
-   * @param[in] reduction_axis Reduction axis vector.
-   * @param[in] keep_dims      If true, retains reduced dimensions with length 1.
-   * @param[in] output         Destination tensor info. Data type supported: Same as @p input
-   * @param[in] op             Reduce operation to perform.
-   *
-   * @return A status
-   */
-  static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis,
-                         bool keep_dims, const ITensorInfo *output, ReduceOperation op);
-
-  // Inherited methods overridden:
-  void run() override;
-
-private:
-  MemoryGroup _memory_group;
-  std::vector<NEReductionOperationEx> _reduction_kernels;
-  std::vector<Tensor> _reduced_outs;
-  NEReshapeLayer _reshape;
-  unsigned int _reduction_ops;
-  bool _keep_dims;
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NEON_REDUCE_OPERATION_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h
deleted file mode 100644
index 48b416923..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEON_REDUCE_SUM_H__
-#define __ARM_COMPUTE_NEON_REDUCE_SUM_H__
-
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
-#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to perform reduce operation */
-class NEReduceSum : public IFunction
-{
-public:
-  /** Constructor */
-  NEReduceSum(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-  /** Configure kernel
-   *
-   * @note Supported tensor rank: up to 4
-   *
-   * @param[in]  input          Source tensor. Data type supported: QASYMM8/F16/F32
-   * @param[in]  reduction_axis Reduction axis vector.
-   * @param[in]  keep_dims      If true, retains reduced dimensions with length 1.
-   * @param[out] output         Destination tensor. Data type supported: Same as @p input
-   */
-  void configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims,
-                 ITensor *output);
-
-  /** Static function to check if given info will lead to a valid configuration of @ref NEReduceSum
-   *
-   * @param[in] input          Source tensor info. Data type supported: QASYMM8/F16/F32
-   * @param[in] reduction_axis Reduction axis vector.
-   * @param[in] keep_dims      If true, retains reduced dimensions with length 1.
-   * @param[in] output         Destination tensor info. Data type supported: Same as @p input
-   *
-   * @return A status
-   */
-  static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis,
-                         bool keep_dims, const ITensorInfo *output);
-
-  // Inherited methods overridden:
-  void run() override;
-
-private:
-  MemoryGroup _memory_group;
-  std::vector<NEReductionOperation> _reduction_kernels;
-  std::vector<Tensor> _reduced_outs;
-  NEReshapeLayer _reshape;
-  unsigned int _reduction_ops;
-  bool _keep_dims;
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NEON_REDUCE_SUM_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReductionOperationEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReductionOperationEx.h
deleted file mode 100644
index 1693922b7..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReductionOperationEx.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEREDUCTIONOPERATIONEX_H__
-#define __ARM_COMPUTE_NEREDUCTIONOPERATIONEX_H__
-
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h"
-#include "arm_compute/core/TypesEx.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to simulate a reduction operation. This function calls the following NEON
- * kernels:
- *
- * -# @ref NEFillBorderKernel
- * -# @ref NEReductionOperationKernelEx
- *
- */
-class NEReductionOperationEx : public IFunction
-{
-public:
-  /** Default constructor */
-  NEReductionOperationEx();
-  /** Set the input and output tensors.
-   *
-   * @param[in]  input  Source tensor. Data type supported: QASYMM8/F16/F32.
-   * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input.
-   * @param[in]  axis   Dimension along which to reduce.
-   * @param[in]  op     Reduction operation to perform.
-   */
-  void configure(ITensor *input, ITensor *output, unsigned int axis, ReduceOperation op);
-
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * NEReductionOperationEx.
-   *
-   * @param[in] input  Source tensor info. Data type supported: QASYMM8/F16/F32.
-   * @param[in] output Destination tensor info. Data types and data layouts supported: same as
-   *                   @p input.
-   * @param[in] axis   Dimension along which to reduce.
-   * @param[in] op     Reduction operation to perform.
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis,
-                         ReduceOperation op);
-
-  // Inherited methods overridden:
-  void run() override;
-
-private:
-  NEReductionOperationKernelEx _reduction_kernel;
-  NEFillBorderKernel _fill_border_kernel;
-  size_t _window_split;
-  int _reduction_axis; // NOTE(review): signed, whereas configure()/validate() take unsigned axis
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NEREDUCTIONOPERATIONEX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h
deleted file mode 100644
index 24ff5dac9..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h
+++ /dev/null
@@ -1,172 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__
-#define __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__
-
-#include "arm_compute/runtime/CPP/functions/CPPUpsample.h"
-#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEReverse.h"
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/Tensor.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-/** Function to run the deconvolution layer.
- *
- * Deconvolution Layer is the backward pass of Convolution Layer. First we transform the input
- * depending on the stride and pad info and then perform a 1x1
- * convolution pass. Input stride defines how many zeroes we should put between each element of the
- * input, pad is the amount of padding and finally a is a user
- * specified value where a < stride - 1 that increases the padding top and right of the input image.
- *
- *  The relation between input to output is as follows:
- *  \f[
- *       width\_output = (width\_input - 1) \cdot stride\_x - 2 \cdot padding\_x + kernel\_x
- *  \f]
- *  \f[
- *       height\_output = (height\_input - 1) \cdot stride\_y - 2 \cdot padding\_y + kernel\_y
- *  \f]
- *
- *  where
- *      width is the size of the first input dimension.
- *      height is the size of the second input dimension.
- *      width_output is the size of the first output dimension.
- *      height_output is the size of the second output dimension.
- *      kernel_x and kernel_y are the convolution sizes in x and y.
- *      stride_x and stride_y are the input strides of the first and second dimension.
- *
- * The weights used by Deconvolution are supposed to be the same as the ones used for Convolution.
- * Therefore, it will be necessary to use the weights in the
- * reverse order to perform an actual convolution. This is achieved by using @ref NEReverse.
- *
- * This function calls the following NEON kernels/functions:
- *
- * -# @ref CPPUpsample
- * -# @ref NEConvolutionLayer
- * -# @ref NEPermute
- * -# @ref NEReverse
- *
- */
-class NETransposeConvLayer : public IFunction
-{
-public:
-  /** Constructor */
-  NETransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  NETransposeConvLayer(const NETransposeConvLayer &) = delete;
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  NETransposeConvLayer &operator=(const NETransposeConvLayer &) = delete;
-  /** Allow instances of this class to be moved */
-  NETransposeConvLayer(NETransposeConvLayer &&) = default;
-  /** Allow instances of this class to be moved */
-  NETransposeConvLayer &operator=(NETransposeConvLayer &&) = default;
-  /** Default destructor */
-  virtual ~NETransposeConvLayer() = default;
-
-  /** Set the input, weights, biases and output tensors.
-   *
-   * @param[in,out] input   Input tensor. 3 lower dimensions represent a single input, and an
-   * optional 4th dimension for batch of inputs. Data types supported:
-   * F32/F16/QASYMM8/QASYMM8_SIGNED.
-   * @param[in]     weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type
-   * supported: Same as @p input.
-   * @param[in]     bias    Optional, ignored if NULL. The biases have one dimension. Data types
-   * supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input.
-   * @param[out]    output  Output tensor. The output has the same number of dimensions as the @p
-   * input.
-   * @param[in]     info    Contains padding and policies to be used in the deconvolution, this is
-   * described in @ref PadStrideInfo.
-   * @param[in]     invalid_right  The number of zeros added to right edge of the output.
-   * @param[in]     invalid_bottom The number of zeros added to bottom edge of the output.
-   *
-   */
-  void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output,
-                 const PadStrideInfo &info, unsigned int invalid_right,
-                 unsigned int invalid_bottom);
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * NETransposeConvLayer
-   *
-   * @param[in] input   Input tensor info. 3 lower dimensions represent a single input, and an
-   * optional 4th dimension for batch of inputs. Data types supported:
-   * F32/F16/QASYMM8/QASYMM8_SIGNED.
-   * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data type
-   * supported: Same as @p input.
-   * @param[in] bias    (Optional) The biases have one dimension. Data types
-   * supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input.
-   * @param[in] output  Output tensor info. It has the same number of dimensions as @p input.
-   * @param[in] info    Contains padding and policies to be used in the deconvolution, this is
-   * described in @ref PadStrideInfo.
-   * @param[in] invalid_right  The number of zeros added to right edge of the output.
-   * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
-                         const ITensorInfo *bias, const ITensorInfo *output,
-                         const PadStrideInfo &info, unsigned int invalid_right,
-                         unsigned int invalid_bottom);
-
-  // Inherited methods overridden:
-  void run() override;
-  void prepare() override;
-
-private:
-  MemoryGroup _memory_group;
-  NEConvolutionLayer _conv_f;
-  CPPUpsample _upsample_f;
-  NEReverse _flip_weights;
-  Tensor _scaled_output;
-  Tensor _weights_flipped;
-  Tensor _flip_axis;
-  const ITensor *_original_weights;
-  ITensor *_input;
-  PadStrideInfo _info;
-  bool _is_prepared;
-};
-} // arm_compute
-#endif /* __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__ */
diff --git a/compute/ARMComputeEx/resolve_includes.py b/compute/ARMComputeEx/resolve_includes.py
deleted file mode 100755
index f37c2a957..000000000
--- a/compute/ARMComputeEx/resolve_includes.py
+++ /dev/null
@@ -1,116 +0,0 @@
-# Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Copyright (c) 2016, 2017 ARM Limited.
-#
-# SPDX-License-Identifier: MIT
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to
-# deal in the Software without restriction, including without limitation the
-# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-# sell copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-import collections
-import os.path
-import re
-import subprocess
-import glob
-
-
-def resolve_includes(target, source):
-    # File collection
-    FileEntry = collections.namedtuple('FileEntry', 'target_name file_contents')
-
-    # Include pattern
-    pattern = re.compile("#include \"(.*)\"")
-
-    # Get file contents
-    files = []
-    for i in range(len(source)):
-        src = source[i]
-        dst = target[i]
-        f = open(src)
-        cts = f.read()
-        f.close()
-        contents = cts.splitlines()
-        entry = FileEntry(target_name=dst, file_contents=contents)
-        files.append((os.path.basename(src), entry))
-
-    # Create dictionary of tupled list
-    files_dict = dict(files)
-
-    # Check for includes (can only be files in the same folder)
-    final_files = []
-    for file in files:
-        done = False
-        tmp_file = file[1].file_contents
-        print(file[1].target_name)
-        while not done:
-            file_count = 0
-            updated_file = []
-            for line in tmp_file:
-                found = pattern.search(line)
-                if found:
-                    include_file = found.group(1)
-                    data = files_dict[include_file].file_contents
-                    updated_file.extend(data)
-                else:
-                    updated_file.append(line)
-                    file_count += 1
-
-            # Check if all include are replaced.
-            if file_count == len(tmp_file):
-                done = True
-
-            # Update temp file
-            tmp_file = updated_file
-
-        # Append and prepend string literal identifiers and add expanded file to final list
-        tmp_file.insert(0, "R\"(\n")
-        tmp_file.append("\n)\"")
-        entry = FileEntry(target_name=file[1].target_name, file_contents=tmp_file)
-        final_files.append((file[0], entry))
-
-    # Write output files
-    for file in final_files:
-        with open(file[1].target_name, 'w+') as out_file:
-            out_file.write("\n".join(file[1].file_contents))
-
-
-# Generate embed files
-cl_files = glob.glob('src/core/CL/cl_kernels/*.cl')
-cl_files += glob.glob('src/core/CL/cl_kernels/*.h')
-
-# DEBUG: print cl files
-print("cl_files:")
-print(cl_files)
-
-embed_files = [f + "embed" for f in cl_files]
-print("embed_files:")
-print(embed_files)
-
-resolve_includes(embed_files, cl_files)
diff --git a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
deleted file mode 100644
index 81d0cb70f..000000000
--- a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
+++ /dev/null
@@ -1,369 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Utils.h"
-
-#include <algorithm>
-#include <fstream>
-#include <iostream>
-#include <utility>
-#include <vector>
-
-using namespace arm_compute;
-
-// Lookup table from kernel name to the name of the program file that defines it.
-// create_kernel() resolves the program of a requested kernel through this map.
-const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map = {
-    // ARMComputeEx kernels
-    {"arg_min_max_ex_x", "arg_min_max_ex.cl"},
-    {"arg_min_max_ex_y", "arg_min_max_ex.cl"},
-    {"arg_min_max_ex_z", "arg_min_max_ex.cl"},
-    {"arg_min_max_ex_w", "arg_min_max_ex.cl"},
-    {"binary_logical_op", "binary_logical_op.cl"},
-    {"cast_bool", "cast.cl"},
-    {"embedding_lookup", "embedding_lookup.cl"},
-    {"gather_ex", "gather_ex.cl"},
-    {"gather_ex_1d", "gather_ex.cl"},
-    {"gather_ex_1d_out", "gather_ex.cl"},
-    {"gemmlowp_mm_midgard_ex", "gemmlowp_ex.cl"},
-    {"hashtable_lookup", "hashtable_lookup.cl"},
-    {"instance_normalization_ex", "instance_normalization_ex.cl"},
-    {"multiply_scale_factor", "multiply_scale_factor.cl"},
-    {"neg_tensor", "neg_tensor.cl"},
-    {"one_hot", "one_hot.cl"},
-    {"one_hot_only_on_value", "one_hot.cl"},
-    {"quantization_symm8", "quantization_symm8.cl"},
-    {"reduce_min_max", "reduce_operation.cl"},
-    {"reduce_sum_mean", "reduce_operation.cl"},
-    {"topkv2_init", "topkv2.cl"},
-    {"topkv2_find_first_negative", "topkv2.cl"},
-    {"topkv2_reorder_negatives", "topkv2.cl"},
-    {"topkv2_store", "topkv2.cl"},
-    {"radixsort_histogram", "topkv2_radixsort.cl"},
-    {"radixsort_scanhistograms", "topkv2_radixsort.cl"},
-    {"radixsort_pastehistograms", "topkv2_radixsort.cl"},
-    {"radixsort_reorder", "topkv2_radixsort.cl"},
-    {"topkv2_quicksort", "topkv2_quicksort.cl"},
-    {"scale_factor_symm8", "scale_factor.cl"},
-};
-
-// Lookup table from program file name to its source text. The map is empty unless
-// EMBEDDED_KERNELS is defined; each value is pulled in from a generated *embed file
-// (presumably the output of resolve_includes.py - confirm against the build scripts).
-const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map = {
-#ifdef EMBEDDED_KERNELS
-    {
-        "arg_min_max_ex.cl",
-#include "./cl_kernels/arg_min_max_ex.clembed"
-    },
-    {
-        "cast.cl",
-#include "./cl_kernels/cast.clembed"
-    },
-    {
-        "embedding_lookup.cl",
-#include "./cl_kernels/embedding_lookup.clembed"
-    },
-    {
-        "gather_ex.cl",
-#include "./cl_kernels/gather_ex.clembed"
-    },
-    {
-        "gemmlowp_ex.cl",
-#include "./cl_kernels/gemmlowp_ex.clembed"
-    },
-    {
-        "hashtable_lookup.cl",
-#include "./cl_kernels/hashtable_lookup.clembed"
-    },
-    {
-        "helpers.h",
-#include "./cl_kernels/helpers.hembed"
-    },
-    {
-        "helpers_asymm.h",
-#include "./cl_kernels/helpers_asymm.hembed"
-    },
-    {
-        "instance_normalization_ex.cl",
-#include "./cl_kernels/instance_normalization_ex.clembed"
-    },
-    {
-        "binary_logical_op.cl",
-#include "./cl_kernels/binary_logical_op.clembed"
-    },
-    {
-        "multiply_scale_factor.cl",
-#include "./cl_kernels/multiply_scale_factor.clembed"
-    },
-    {
-        "neg_tensor.cl",
-#include "./cl_kernels/neg_tensor.clembed"
-    },
-    {
-        "one_hot.cl",
-#include "./cl_kernels/one_hot.clembed"
-    },
-    {
-        "quantization_symm8.cl",
-#include "./cl_kernels/quantization_symm8.clembed"
-    },
-    {
-        "reduce_operation.cl",
-#include "./cl_kernels/reduce_operation.clembed"
-    },
-    {
-        "scale_factor.cl",
-#include "./cl_kernels/scale_factor.clembed"
-    },
-    {
-        "topkv2.cl",
-#include "./cl_kernels/topkv2.clembed"
-    },
-    {
-        "topkv2_radixsort.cl",
-#include "./cl_kernels/topkv2_radixsort.clembed"
-    },
-    {
-        "topkv2_quicksort.cl",
-#include "./cl_kernels/topkv2_quicksort.clembed"
-    },
-
-#endif /* EMBEDDED_KERNELS */
-};
-
-// Default-constructs the context and device, uses "." as the kernel path and starts with
-// empty program caches.
-CLKernelLibraryEx::CLKernelLibraryEx()
-    : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map()
-{
-  opencl_is_available(); // Make sure the OpenCL symbols are initialised *before* the
-                         // CLKernelLibraryEx is built
-}
-
-// Returns the single library instance; it is a function-local static, so it is
-// constructed the first time this function is called.
-CLKernelLibraryEx &CLKernelLibraryEx::get()
-{
-  static CLKernelLibraryEx _kernel_library;
-  return _kernel_library;
-}
-
-/** Creates a kernel from the library, building the owning program on first use.
- *
- * The program is looked up through _kernel_program_map and built with the given options
- * plus device dependent flags (FP16 define, CL2.0 or the Arm non-uniform work-group flag).
- * Built programs are cached under the key "<program name>_<build options>".
- *
- * @param[in] kernel_name       Name of the kernel; must be a key of _kernel_program_map.
- * @param[in] build_options_set Build options forwarded to the program build.
- *
- * @return The kernel created from the (possibly cached) built program.
- */
-Kernel CLKernelLibraryEx::create_kernel(const std::string &kernel_name,
-                                        const StringSet &build_options_set) const
-{
-  // Find which program contains the kernel
-  const auto kernel_program_it = _kernel_program_map.find(kernel_name);
-
-  if (_kernel_program_map.end() == kernel_program_it)
-  {
-    ARM_COMPUTE_ERROR_VAR("Kernel %s not found in the CLKernelLibrary", kernel_name.c_str());
-  }
-  std::string concat_str;
-
-  if (fp16_supported())
-  {
-    concat_str += " -DARM_COMPUTE_OPENCL_FP16_ENABLED=1 ";
-  }
-
-  if (get_cl_version(_device) == CLVersion::CL20)
-  {
-    concat_str += " -cl-std=CL2.0 ";
-  }
-  else if (arm_non_uniform_workgroup_supported(_device))
-  {
-    concat_str += " -cl-arm-non-uniform-work-group-size ";
-  }
-  else
-  {
-    ARM_COMPUTE_ERROR("Non uniform workgroup size is not supported!!");
-  }
-
-  // Check if the program has been built before with same build options.
-  // The name lives in the static map, so a reference is enough (no string copy).
-  const std::string &program_name = kernel_program_it->second;
-  const std::string build_options = stringify_set(build_options_set) + concat_str;
-
-  const std::string built_program_name = program_name + "_" + build_options;
-  const auto built_program_it = _built_programs_map.find(built_program_name);
-
-  cl::Program cl_program;
-
-  if (_built_programs_map.end() != built_program_it)
-  {
-    // If program has been built, retrieve to create kernel from it
-    cl_program = built_program_it->second;
-  }
-  else
-  {
-    // Get program: load_program() returns a reference into the program cache, so bind
-    // to it instead of copying the whole Program (including its source) on every miss
-    const Program &program = load_program(program_name);
-
-    // Build program
-    cl_program = program.build(build_options);
-
-    // Add built program to internal map
-    _built_programs_map.emplace(built_program_name, cl_program);
-  }
-
-  // Create and return kernel
-  return Kernel(kernel_name, cl_program);
-}
-
-// Registers an already built program in the built-program cache under the given name.
-// std::map::emplace does not overwrite, so an existing entry with the same name is kept.
-void CLKernelLibraryEx::add_built_program(const std::string &built_program_name,
-                                          cl::Program program)
-{
-  _built_programs_map.emplace(built_program_name, program);
-}
-
-// Forwards to the free function ::fp16_supported() for the library's device.
-bool CLKernelLibraryEx::fp16_supported() const { return ::fp16_supported(_device); }
-
-// Reports whether the library's device advertises the cl_khr_int64_base_atomics extension.
-bool CLKernelLibraryEx::int64_base_atomics_supported() const
-{
-  return device_supports_extension(_device, "cl_khr_int64_base_atomics");
-}
-
-/** Returns the program with the given name, creating and caching it on first use.
- *
- * With EMBEDDED_KERNELS the source comes from _program_source_map; otherwise it is read
- * from disk, preferring a binary file over the source file.
- *
- * @param[in] program_name Name of the program file, e.g. as stored in _kernel_program_map.
- *
- * @return Reference to the entry stored in _programs_map.
- */
-const Program &CLKernelLibraryEx::load_program(const std::string &program_name) const
-{
-  const auto program_it = _programs_map.find(program_name);
-
-  // Already loaded: hand back the cached entry
-  if (program_it != _programs_map.end())
-  {
-    return program_it->second;
-  }
-
-  Program program;
-
-#ifdef EMBEDDED_KERNELS
-  const auto program_source_it = _program_source_map.find(program_name);
-
-  if (_program_source_map.end() == program_source_it)
-  {
-    ARM_COMPUTE_ERROR_VAR("Embedded program for %s does not exist.", program_name.c_str());
-  }
-
-  program = Program(_context, program_name, program_source_it->second);
-#else  /* EMBEDDED_KERNELS */
-  // Check for binary
-  // NOTE(review): no path separator is inserted between _kernel_path and the program name,
-  // so _kernel_path is expected to already end with one - confirm against the callers
-  std::string source_name = _kernel_path + program_name;
-  std::string binary_name = source_name + "bin";
-
-  if (std::ifstream(binary_name).is_open())
-  {
-    const std::string program_binary = read_file(binary_name, true);
-    program = Program(_context, _device, program_name,
-                      std::vector<unsigned char>(program_binary.begin(), program_binary.end()));
-  }
-  else if (std::ifstream(source_name).is_open())
-  {
-    program = Program(_context, program_name, read_file(source_name, false));
-  }
-  else
-  {
-    ARM_COMPUTE_ERROR_VAR("Kernel file %s does not exist.", source_name.c_str());
-  }
-#endif /* EMBEDDED_KERNELS */
-
-  // Insert program to program map
-  const auto new_program = _programs_map.emplace(program_name, std::move(program));
-
-  return new_program.first->second;
-}
-
-// Joins the build options into one string, each option preceded by a space. Without
-// EMBEDDED_KERNELS the result starts with an include flag for the kernel path.
-std::string CLKernelLibraryEx::stringify_set(const StringSet &s) const
-{
-#ifndef EMBEDDED_KERNELS
-  std::string joined = "-I" + _kernel_path + " ";
-#else  /* EMBEDDED_KERNELS */
-  std::string joined;
-#endif /* EMBEDDED_KERNELS */
-
-  // Append every option in set order
-  for (auto option = s.begin(); option != s.end(); ++option)
-  {
-    joined.append(" ").append(*option);
-  }
-
-  return joined;
-}
-
-// Returns a copy of the embedded source registered for the given program name and
-// reports an error when the name is not present in _program_source_map.
-std::string CLKernelLibraryEx::get_program_source(const std::string &program_name)
-{
-  const auto entry = _program_source_map.find(program_name);
-  const bool is_missing = (_program_source_map.end() == entry);
-
-  if (is_missing)
-  {
-    ARM_COMPUTE_ERROR_VAR("Embedded program for %s does not exist.", program_name.c_str());
-  }
-
-  return entry->second;
-}
-
-/** Queries CL_KERNEL_WORK_GROUP_SIZE of the given kernel on the library's device.
- *
- * @param[in] kernel Kernel to query.
- *
- * @return The value reported by the query, or 0 when the query fails and the error check
- *         below does not stop execution.
- */
-size_t CLKernelLibraryEx::max_local_workgroup_size(const cl::Kernel &kernel) const
-{
-  // Initialised so the return value is defined even if the query fails
-  size_t result = 0;
-
-  // The query reports an OpenCL status code, so keep it in cl_int and compare to CL_SUCCESS
-  const cl_int err = kernel.getWorkGroupInfo(_device, CL_KERNEL_WORK_GROUP_SIZE, &result);
-  ARM_COMPUTE_ERROR_ON_MSG(
-      err != CL_SUCCESS,
-      "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel");
-  ARM_COMPUTE_UNUSED(err);
-
-  return result;
-}
-
-// Picks the default NDRange from the GPU target of the default OpenCL device:
-// (128, 1) for the Midgard family listed below, the null range for everything else.
-cl::NDRange CLKernelLibraryEx::default_ndrange() const
-{
-  // The target is taken from the default device, not from _device:
-  //    GPUTarget   _target = get_target_from_device(_device);
-  cl::Device default_device = cl::Device::getDefault();
-  const GPUTarget gpu_target = get_target_from_device(default_device);
-
-  switch (gpu_target)
-  {
-    case GPUTarget::MIDGARD:
-    case GPUTarget::T600:
-    case GPUTarget::T700:
-    case GPUTarget::T800:
-      return cl::NDRange(128u, 1);
-    default:
-      return cl::NullRange;
-  }
-}
-
-// Returns the CL_DEVICE_VERSION string reported by the library's device.
-std::string CLKernelLibraryEx::get_device_version() { return _device.getInfo<CL_DEVICE_VERSION>(); }
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl
deleted file mode 100644
index 0a014d15c..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl
+++ /dev/null
@@ -1,565 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-// Comparison helpers. Float data uses the isgreater/isless built-ins; integer data uses a
-// scalar 1/0 result when WIDTH is defined and a 16-wide select mask (-1/0) otherwise.
-// Macro arguments are parenthesised so compound expressions can be passed safely.
-#if defined(FLOAT_DATA_TYPE)
-#define ISGREATER(x, y) isgreater(x, y)
-#define ISLESS(x, y) isless(x, y)
-#else // !FLOAT_DATA_TYPE
-#if defined(WIDTH)
-// The whole expansion is parenthesised as well, otherwise the conditional operator would
-// capture neighbouring operands when the macro is used inside a larger expression
-#define ISGREATER(x, y) (((x) > (y)) ? 1 : 0)
-#define ISLESS(x, y) (((x) < (y)) ? 1 : 0)
-#else // !defined(WIDTH)
-#define ISGREATER(x, y)                                                                     \
-  select((VEC_DATA_TYPE(DATA_TYPE_SELECT, 16))0, (VEC_DATA_TYPE(DATA_TYPE_SELECT, 16)) - 1, \
-         (x) > (y))
-#define ISLESS(x, y)                                                                        \
-  select((VEC_DATA_TYPE(DATA_TYPE_SELECT, 16))0, (VEC_DATA_TYPE(DATA_TYPE_SELECT, 16)) - 1, \
-         (x) < (y))
-#endif // defined(WIDTH)
-#endif // defined(FLOAT_DATA_TYPE)
-
-// Select the comparison that matches the requested reduction
-#if defined(ARG_MAX)
-#define CONDITION_TO_USE(x, y) ISGREATER(x, y)
-#elif defined(ARG_MIN)
-#define CONDITION_TO_USE(x, y) ISLESS(x, y)
-#else // !(defined(ARG_MAX) || defined(ARG_MIN))
-#error "Unsupported reduction operation!"
-#endif // defined(ARG_MAX)
-
-#if defined(DATA_TYPE_OUTPUT) && defined(DATA_TYPE_SELECT)
-#if defined(WIDTH)
-#if defined(ARG_MIN)
-#if defined(PREV_OUTPUT)
-/** Find index minimum value of a vector
- *
- * @param[in] input    Pointer to the first value.
- * @param[in] prev_res Pointer to indices from a previous stage; each entry is used as an
- *                     offset into @p input.
- * @param[in] x_idx    Index along x used to compute how many entries of @p prev_res are
- *                     scanned.
- *
- * @return index of the vector.
- */
-inline DATA_TYPE_OUTPUT arg_idx_min_prev_out(__global const DATA_TYPE *input,
-                                             __global const DATA_TYPE_OUTPUT *prev_res,
-                                             const int x_idx)
-{
-  // Upper bound of the scan; replaced by the remaining element count when the
-  // 16-element block would run past WIDTH
-  int end_elem = (x_idx + 1) * 16;
-  if (end_elem > WIDTH)
-  {
-    end_elem = WIDTH - x_idx * 16;
-  }
-  DATA_TYPE_OUTPUT res = prev_res[0];
-  for (int x_v = 1; x_v < end_elem; ++x_v)
-  {
-    // Keep the candidate index when its value is strictly smaller than the current best
-    res = select(res, prev_res[x_v], *(input + prev_res[x_v]) < *(input + res));
-  }
-  return res;
-}
-#else // !defined(PREV_OUTPUT)
-/** Find index minimum value of a vector
- *
- * @param[in] input Pointer to the first value.
- * @param[in] x_idx Index along x; only used when WIDTH >= 16, where it selects the
- *                  16-element block whose offset is added to the returned index.
- *
- * @return index of the vector.
- */
-inline DATA_TYPE_OUTPUT arg_idx_min(__global const DATA_TYPE *input, const int x_idx)
-{
-#if WIDTH < 16
-  // Fewer than 16 elements: plain scalar scan, a later element wins only if strictly smaller
-  DATA_TYPE_OUTPUT res = 0;
-  for (DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v)
-  {
-    res = select(res, x_v, *(input + x_v) < *(input + res));
-  }
-  return res;
-#else  // WIDTH >= 16
-  // When the 16-element block would run past WIDTH, step back so a full vector is loaded
-  int x_elem = x_idx * 16;
-  const int x_goback = select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH);
-  x_elem -= x_goback;
-
-  VEC_DATA_TYPE(DATA_TYPE, 16)
-  in = vload16(0, input - x_goback);
-  VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
-  res = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
-
-  // Pairwise reduction 16 -> 8 -> 4 -> 2 -> 1, carrying the lane index along with the value.
-  // First step: on equal values the lower half (smaller lane index) is kept.
-  VEC_DATA_TYPE(DATA_TYPE_SELECT, 8)
-  idx_sel = (in.s01234567 <= in.s89abcdef);
-  in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel);
-  res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8));
-
-  // Later steps: on equal values the candidate with the smaller index is kept
-  idx_sel.s0123 = (in.s0123 < in.s4567) ||
-                  (in.s0123 == in.s4567 &&
-                   CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4)));
-  in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123);
-  res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4));
-
-  idx_sel.s01 =
-      (in.s01 < in.s23) ||
-      (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2)));
-  in.s01 = select(in.s23, in.s01, idx_sel.s01);
-  res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2));
-
-  idx_sel.s0 = (in.s0 < in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), DATA_TYPE_SELECT));
-  res.s0 = select(res.s1, res.s0, CONVERT(idx_sel.s0, int));
-
-  // Convert the lane index into an index relative to the input pointer's row
-  return res.s0 + x_elem;
-#endif // WIDTH < 16
-}
-#endif // defined(PREV_OUTPUT)
-#endif // defined(ARG_MIN)
-#if defined(ARG_MAX)
-#if defined(PREV_OUTPUT)
-/** Find index maximum value of a vector
- *
- * @param[in] input    Pointer to the first value.
- * @param[in] prev_res Pointer to indices from a previous stage; each entry is used as an
- *                     offset into @p input.
- * @param[in] x_idx    Index along x used to compute how many entries of @p prev_res are
- *                     scanned.
- *
- * @return index of the vector.
- */
-inline DATA_TYPE_OUTPUT arg_idx_max_prev_out(__global const DATA_TYPE *input,
-                                             __global const DATA_TYPE_OUTPUT *prev_res,
-                                             const int x_idx)
-{
-  int end_elem = (x_idx + 1) * 16;
-  if (end_elem > WIDTH)
-  {
-    end_elem = WIDTH - x_idx * 16;
-  }
-  DATA_TYPE_OUTPUT res = prev_res[0];
-  DATA_TYPE_OUTPUT condition_check2;
-  for (int x_v = 1; x_v < end_elem; ++x_v)
-  {
-    int i1 = prev_res[x_v];
-    // Re-read the current best on every iteration: comparing against a value captured
-    // before the loop would test each candidate against the first entry only
-    unsigned int res_int = res;
-    condition_check2 = *(input + i1) > *(input + res_int);
-    res = select(res, prev_res[x_v], condition_check2);
-  }
-  return res;
-}
-#else // !defined(PREV_OUTPUT)
-/** Find index maximum value of a vector
- *
- * @param[in] input Pointer to the first value.
- * @param[in] x_idx Index along x; only used when WIDTH >= 16, where it selects the
- *                  16-element block whose offset is added to the returned index.
- *
- * @return index of the vector.
- */
-inline DATA_TYPE_OUTPUT arg_idx_max(__global const DATA_TYPE *input, const int x_idx)
-{
-#if WIDTH < 16
-  // Fewer than 16 elements: plain scalar scan, a later element wins only if strictly greater
-  DATA_TYPE_OUTPUT res = 0;
-  unsigned int i1;
-  unsigned int i2;
-  DATA_TYPE_OUTPUT condition_check;
-  for (DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v)
-  {
-    // i1 is the candidate index, i2 the current best index
-    i1 = x_v;
-    i2 = res;
-    condition_check = *(input + i1) > *(input + i2);
-    res = select(res, x_v, condition_check);
-  }
-  return res;
-#else  // WIDTH >= 16
-  // When the 16-element block would run past WIDTH, step back so a full vector is loaded
-  int x_elem = x_idx * 16;
-  const int x_goback = select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH);
-  x_elem -= x_goback;
-
-  VEC_DATA_TYPE(DATA_TYPE, 16)
-  in = vload16(0, input - x_goback);
-  VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
-  res = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
-
-  // Pairwise reduction 16 -> 8 -> 4 -> 2 -> 1, carrying the lane index along with the value.
-  // First step: on equal values the lower half (smaller lane index) is kept.
-  VEC_DATA_TYPE(DATA_TYPE_SELECT, 8)
-  idx_sel = (in.s01234567 >= in.s89abcdef);
-  in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel);
-  res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8));
-
-  // Later steps: on equal values the candidate with the smaller index is kept
-  idx_sel.s0123 = (in.s0123 > in.s4567) ||
-                  (in.s0123 == in.s4567 &&
-                   CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4)));
-  in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123);
-  res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4));
-
-  idx_sel.s01 =
-      (in.s01 > in.s23) ||
-      (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2)));
-  in.s01 = select(in.s23, in.s01, idx_sel.s01);
-  res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2));
-
-  idx_sel.s0 = (in.s0 > in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), DATA_TYPE_SELECT));
-  res.s0 = select(res.s1, res.s0, CONVERT(idx_sel.s0, int));
-
-  // Convert the lane index into an index relative to the input pointer's row
-  return res.s0 + x_elem;
-#endif // WIDTH < 16
-}
-#endif // defined(PREV_OUTPUT)
-#endif // defined(ARG_MAX)
-
-/** This kernel performs parallel reduction given an operation on x-axis.
- *
- * @note In case the results of previous stages are passed the flag PREV_OUTPUT has to be passed
- * using -DPREV_OUTPUT
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g.
- * -DDATA_TYPE_OUTPUT=uint
- * @note The arg_max flag must be passed at compile time using -DARG_MAX if we want to compute the
- * ArgMax
- * @note The arg_min flag must be passed at compile time using -DARG_MIN if we want to compute the
- * ArgMin
- *
- * @param[in] src_ptr                                   Pointer to the source tensor. Supported data
- * types: S32/F16/F32
- * @param[in] src_stride_x                              Stride of the source tensor in X dimension
- * (in bytes)
- * @param[in] src_step_x                                src_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] src_stride_y                              Stride of the source tensor in Y dimension
- * (in bytes)
- * @param[in] src_step_y                                src_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes         The offset of the first element in the
- * source tensor
- * @param[in] prev_res_ptr                              (Optional) Pointer to previous results
- * tensor. Supported data types: U32/S32
- * @param[in] prev_res_stride_x                         (Optional) Stride of the output tensor in X
- * dimension (in bytes)
- * @param[in] prev_res_step_x                           (Optional) prev_res_stride_x * number of
- * elements along X processed per workitem(in bytes)
- * @param[in] prev_res_stride_y                         (Optional) Stride of the output tensor in Y
- * dimension (in bytes)
- * @param[in] prev_res_step_y                           (Optional) prev_res_stride_y * number of
- * elements along Y processed per workitem(in bytes)
- * @param[in] prev_res_offset_first_element_in_bytes    (Optional) The offset of the first element
- * in the previous results tensor
- * @param[in] partial_res_ptr                           The local buffer to hold partial result
- * values. Supported data types: U32/S32
- * @param[in] partial_res_stride_x                      Stride of the output tensor in X dimension
- * (in bytes)
- * @param[in] partial_res_step_x                        partial_res_stride_x * number of elements
- * along X processed per workitem(in bytes)
- * @param[in] partial_res_stride_y                      Stride of the output tensor in Y dimension
- * (in bytes)
- * @param[in] partial_res_step_y                        partial_res_stride_y * number of elements
- * along Y processed per workitem(in bytes)
- * @param[in] partial_res_offset_first_element_in_bytes The offset of the first element in the
- * partial results tensor
- * @param[in] local_results                             Local buffer for storing the partial result
- */
-__kernel void arg_min_max_ex_x(IMAGE_DECLARATION(src),
-#if defined(PREV_OUTPUT)
-                               IMAGE_DECLARATION(prev_res),
-#endif // defined(PREV_OUTPUT)
-                               IMAGE_DECLARATION(partial_res),
-                               __local DATA_TYPE_OUTPUT *local_results)
-{
-#if defined(PREV_OUTPUT)
-  Image src = CONVERT_TO_IMAGE_STRUCT_NO_STEP(src);
-  Image prev_res = CONVERT_TO_IMAGE_STRUCT(prev_res);
-#else  // !defined(PREV_OUTPUT)
-  Image src = CONVERT_TO_IMAGE_STRUCT(src);
-#endif // defined(PREV_OUTPUT)
-  Image partial_res = CONVERT_TO_IMAGE_STRUCT(partial_res);
-
-  unsigned int lsize = get_local_size(0);
-  unsigned int lid = get_local_id(0);
-
-  const uint x_idx = get_global_id(0);
-  const uint y_idx = get_global_id(1);
-  // Start of the source row handled by this work-item; the indices kept in local_results
-  // are used as element offsets from this pointer
-  const __global DATA_TYPE *src_in_row =
-      (const __global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes +
-                                   y_idx * src_step_y);
-
-  for (unsigned int y = 0; y < get_local_size(1); ++y)
-  {
-    // Stage 1: every work-item stores the index of its own best element
-#if defined(ARG_MAX)
-#if defined(PREV_OUTPUT)
-    local_results[lid] = arg_idx_max_prev_out(
-        src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx);
-#else  // !defined(PREV_OUTPUT)
-    local_results[lid] = arg_idx_max((__global DATA_TYPE *)offset(&src, 0, y), x_idx);
-#endif // defined(PREV_OUTPUT)
-#else  // defined(ARG_MIN)
-#if defined(PREV_OUTPUT)
-    local_results[lid] = arg_idx_min_prev_out(
-        src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx);
-#else  // !defined(PREV_OUTPUT)
-    local_results[lid] = arg_idx_min((__global DATA_TYPE *)offset(&src, 0, y), x_idx);
-#endif // defined(PREV_OUTPUT)
-#endif // defined(ARG_MAX) || defined(ARG_MIN)
-
-    // All indices must be written before any work-item reads a neighbour's entry
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // Looking for the next highest power of 2 (maximum value of lsize is 8)
-    unsigned int middle = lsize - 1;
-    middle |= middle >> 1;
-    middle |= middle >> 2;
-    middle += 1;
-    // Perform parallel reduction
-    DATA_TYPE_OUTPUT condition_check3;
-    for (unsigned int i = middle; i > 0; i >>= 1)
-    {
-      // Stage 2: merge entry lid + i into entry lid; the second test skips partners
-      // beyond the work-group size
-      if (lid < i && lid + i < lsize)
-      {
-        DATA_TYPE tmp0 = *(src_in_row + local_results[lid]);
-        DATA_TYPE tmp1 = *(src_in_row + local_results[lid + i]);
-#if defined(ARG_MAX)
-        // Take the partner when its value is greater, or equal with a smaller index
-        condition_check3 =
-            ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 < tmp1);
-        local_results[lid] = select(local_results[lid], local_results[lid + i], condition_check3);
-#else  // defined(ARG_MIN)
-        // Take the partner when its value is smaller, or equal with a smaller index
-        local_results[lid] = select(
-            local_results[lid], local_results[lid + i],
-            ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 > tmp1));
-#endif // defined(ARG_MAX) || defined(ARG_MIN)
-      }
-      barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
-    // The first work-item of the group writes the group's result
-    if (lid == 0)
-    {
-      ((__global DATA_TYPE_OUTPUT *)offset(&partial_res, get_group_id(0), y))[0] = local_results[0];
-    }
-  }
-}
-#endif // defined(WIDTH)
-
-#if defined(HEIGHT)
-/** This kernel performs reduction on y-axis.
- *
- * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g.
- * -DDATA_TYPE=float
- * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g.
- * -DDATA_TYPE_OUTPUT=uint
- * @note The data type of the select results must be passed at compile time using
- * -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int
- * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
- *
- * @param[in] src_ptr                              Pointer to the source tensor. Supported data
- * types: S32/F16/F32
- * @param[in] src_stride_x                         Stride of the source tensor in X dimension (in
- * bytes)
- * @param[in] src_step_x                           src_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] src_stride_y                         Stride of the source tensor in Y dimension (in
- * bytes)
- * @param[in] src_step_y                           src_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes    The offset of the first element in the source
- * tensor
- * @param[in] output_ptr                           The buffer to hold the resulting indices.
- * Supported data types: U32/S32
- * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in
- * bytes)
- * @param[in] output_step_x                        output_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in
- * bytes)
- * @param[in] output_step_y                        output_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source
- * tensor
- */
-__kernel void arg_min_max_ex_y(IMAGE_DECLARATION(src), IMAGE_DECLARATION(output))
-{
-  Image src = CONVERT_TO_IMAGE_STRUCT(src);
-  Image output = CONVERT_TO_IMAGE_STRUCT(output);
-
-  VEC_DATA_TYPE(DATA_TYPE, 16)
-  res = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0)), VEC_DATA_TYPE(DATA_TYPE, 16));
-
-  VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
-  indx = 0;
-  for (unsigned int y = 1; y < HEIGHT; ++y)
-  {
-    VEC_DATA_TYPE(DATA_TYPE, 16)
-    in =
-        CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE, 16));
-
-    VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
-    cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16));
-    indx = select(indx, y, cond_conv);
-    res = select(res, in, CONDITION_TO_USE(in, res));
-  }
-
-  // Store result
-  vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr);
-}
-#endif // defined(HEIGHT)
-
-#if defined(DEPTH)
-/** Arg-min / arg-max reduction along the Z axis.
- *
- * Every work-item scans DEPTH planes of a 16-lane wide strip. Lane by lane it keeps the value
- * preferred by CONDITION_TO_USE (a macro defined outside this kernel) together with the plane at
- * which that value was seen, and finally stores the 16 plane indices.
- *
- * @note -DDATA_TYPE selects the input element type, e.g. -DDATA_TYPE=float
- * @note -DDATA_TYPE_OUTPUT selects the element type of the stored indices (it is also the type of
- * the loop counter), e.g. -DDATA_TYPE_OUTPUT=uint
- * @note -DDATA_TYPE_SELECT selects the type of the select results, e.g. -DDATA_TYPE_SELECT=int
- * @note -DDEPTH gives the number of planes to reduce, e.g. -DDEPTH=128
- *
- * @param[in]  input_ptr                            Pointer to the source tensor. Supported data
- *                                                  types: S32/F16/F32
- * @param[in]  input_stride_x                       Byte stride of the source tensor along X
- * @param[in]  input_step_x                         input_stride_x * number of elements along X
- *                                                  processed per work-item (in bytes)
- * @param[in]  input_stride_y                       Byte stride of the source tensor along Y
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y
- *                                                  processed per work-item (in bytes)
- * @param[in]  input_stride_z                       Byte stride of the source tensor along Z
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z
- *                                                  processed per work-item (in bytes)
- * @param[in]  input_offset_first_element_in_bytes  Offset of the first element in the source
- *                                                  tensor
- * @param[out] output_ptr                           Pointer to the tensor receiving the selected
- *                                                  plane indices. Supported data types: U32/S32
- * @param[in]  output_stride_x                      Byte stride of the output tensor along X
- * @param[in]  output_step_x                        output_stride_x * number of elements along X
- *                                                  processed per work-item (in bytes)
- * @param[in]  output_stride_y                      Byte stride of the output tensor along Y
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y
- *                                                  processed per work-item (in bytes)
- * @param[in]  output_stride_z                      Byte stride of the output tensor along Z
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z
- *                                                  processed per work-item (in bytes)
- * @param[in]  output_offset_first_element_in_bytes Offset of the first element in the output
- *                                                  tensor
- */
-__kernel void arg_min_max_ex_z(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
-{
-  Tensor3D src_tensor = CONVERT_TO_TENSOR3D_STRUCT(input);
-  Tensor3D dst_tensor = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-  // Plane 0 seeds the running best value; index 0 seeds the running best index.
-  VEC_DATA_TYPE(DATA_TYPE, 16)
-  best_val = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&src_tensor, 0, 0, 0)),
-                     VEC_DATA_TYPE(DATA_TYPE, 16));
-
-  VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
-  best_idx = 0;
-
-  for (DATA_TYPE_OUTPUT plane = 1; plane < DEPTH; ++plane)
-  {
-    VEC_DATA_TYPE(DATA_TYPE, 16)
-    cand = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&src_tensor, 0, 0, plane)),
-                   VEC_DATA_TYPE(DATA_TYPE, 16));
-
-    // The index is updated before the value so that both selections are decided against the
-    // best value of the previous iteration.
-    VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
-    take_cand = CONVERT(CONDITION_TO_USE(cand, best_val), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16));
-    best_idx = select(best_idx, plane, take_cand);
-    best_val = select(best_val, cand, CONDITION_TO_USE(cand, best_val));
-  }
-
-  // Write the 16 winning plane indices
-  vstore16(best_idx, 0, (__global DATA_TYPE_OUTPUT *)dst_tensor.ptr);
-}
-#endif /* defined(DEPTH) */
-
-#if defined(BATCH) && defined(DEPTH)
-/** Arg-min / arg-max reduction along the W axis.
- *
- * Every work-item scans BATCH slices of a 16-lane wide strip. Lane by lane it keeps the value
- * preferred by CONDITION_TO_USE (a macro defined outside this kernel) together with the slice at
- * which that value was seen, and finally stores the 16 slice indices.
- *
- * @note -DDATA_TYPE selects the input element type, e.g. -DDATA_TYPE=float
- * @note -DDATA_TYPE_OUTPUT selects the element type of the stored indices (it is also the type of
- * the loop counter), e.g. -DDATA_TYPE_OUTPUT=uint
- * @note -DDATA_TYPE_SELECT selects the type of the select results, e.g. -DDATA_TYPE_SELECT=int
- * @note -DBATCH gives the number of slices to reduce, e.g. -DBATCH=128
- * @note -DDEPTH gives the depth handed to the 4D tensor helpers, e.g. -DDEPTH=128
- *
- * @param[in]  input_ptr                            Pointer to the source tensor. Supported data
- *                                                  types: S32/F16/F32
- * @param[in]  input_stride_x                       Byte stride of the source tensor along X
- * @param[in]  input_step_x                         input_stride_x * number of elements along X
- *                                                  processed per work-item (in bytes)
- * @param[in]  input_stride_y                       Byte stride of the source tensor along Y
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y
- *                                                  processed per work-item (in bytes)
- * @param[in]  input_stride_z                       Byte stride of the source tensor along Z
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z
- *                                                  processed per work-item (in bytes)
- * @param[in]  input_stride_w                       Byte stride of the source tensor along W
- * @param[in]  input_step_w                         input_stride_w * number of elements along W
- *                                                  processed per work-item (in bytes)
- * @param[in]  input_offset_first_element_in_bytes  Offset of the first element in the source
- *                                                  tensor
- * @param[out] output_ptr                           Pointer to the tensor receiving the selected
- *                                                  slice indices. Supported data types: U32/S32
- * @param[in]  output_stride_x                      Byte stride of the output tensor along X
- * @param[in]  output_step_x                        output_stride_x * number of elements along X
- *                                                  processed per work-item (in bytes)
- * @param[in]  output_stride_y                      Byte stride of the output tensor along Y
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y
- *                                                  processed per work-item (in bytes)
- * @param[in]  output_stride_z                      Byte stride of the output tensor along Z
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z
- *                                                  processed per work-item (in bytes)
- * @param[in]  output_stride_w                      Byte stride of the output tensor along W
- * @param[in]  output_step_w                        output_stride_w * number of elements along W
- *                                                  processed per work-item (in bytes)
- * @param[in]  output_offset_first_element_in_bytes Offset of the first element in the output
- *                                                  tensor
- */
-__kernel void arg_min_max_ex_w(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
-{
-  Tensor4D src_tensor = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH);
-  Tensor4D dst_tensor = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH);
-
-  // Slice 0 seeds the running best value; index 0 seeds the running best index.
-  VEC_DATA_TYPE(DATA_TYPE, 16)
-  best_val = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&src_tensor, 0, 0, 0, 0)),
-                     VEC_DATA_TYPE(DATA_TYPE, 16));
-
-  VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
-  best_idx = 0;
-
-  for (DATA_TYPE_OUTPUT slice = 1; slice < BATCH; ++slice)
-  {
-    VEC_DATA_TYPE(DATA_TYPE, 16)
-    cand = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&src_tensor, 0, 0, 0, slice)),
-                   VEC_DATA_TYPE(DATA_TYPE, 16));
-
-    // The index is updated before the value so that both selections are decided against the
-    // best value of the previous iteration.
-    VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
-    take_cand = CONVERT(CONDITION_TO_USE(cand, best_val), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16));
-    best_idx = select(best_idx, slice, take_cand);
-    best_val = select(best_val, cand, CONDITION_TO_USE(cand, best_val));
-  }
-
-  // Write the 16 winning slice indices
-  vstore16(best_idx, 0, (__global DATA_TYPE_OUTPUT *)dst_tensor.ptr);
-}
-#endif /* defined(BATCH) && defined(DEPTH) */
-#endif /* defined(DATA_TYPE_OUTPUT) && defined(DATA_TYPE_SELECT) */
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl
deleted file mode 100644
index e249663bc..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-
-#ifndef VEC_SIZE
-#define VEC_SIZE 1
-#endif
-
-#if defined(OP_CODE) && defined(DATA_TYPE)
-/** Element-wise binary logical operation (AND or OR) between two input tensors.
- *
- * Each work-item loads VEC_SIZE elements from both inputs, combines them with the logical
- * operator selected by OP_CODE and stores the converted result to the output tensor.
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=uchar
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size.
- *            e.g. -DVEC_SIZE=16. It defaults to 1 when not provided.
- * @attention Operation type(code) specifying which operation to perform should be passed as
- *            preprocessor argument using -DOP_CODE=number: 1 selects LOGICAL AND, 2 selects
- *            LOGICAL OR. With any other value the kernel returns without writing the output.
- *
- * NOTE(review): in OpenCL C the vector forms of && and || yield all-bits-set lanes for true,
- * whereas the scalar forms yield 1, so the stored "true" value may differ between VEC_SIZE == 1
- * and VEC_SIZE > 1 - confirm what the consumers of this kernel expect.
- *
- * @param[in]  input1_ptr                            Pointer to the source tensor.
- *                                                   Supported data types: QASYMM8
- * @param[in]  input1_stride_x                       Stride of the source tensor in X dimension
- *                                                   (in bytes)
- * @param[in]  input1_step_x                         input1_stride_x * number of elements along X
- *                                                   processed per workitem(in bytes)
- * @param[in]  input1_stride_y                       Stride of the source tensor in Y dimension
- *                                                   (in bytes)
- * @param[in]  input1_step_y                         input1_stride_y * number of elements along Y
- *                                                   processed per workitem(in bytes)
- * @param[in]  input1_stride_z                       Stride of the source tensor in Z dimension
- *                                                   (in bytes)
- * @param[in]  input1_step_z                         input1_stride_z * number of elements along Z
- *                                                   processed per workitem(in bytes)
- * @param[in]  input1_offset_first_element_in_bytes  The offset of the first element in the source
- *                                                   tensor
- * @param[in]  input2_ptr                            Pointer to the source tensor.
- *                                                   Supported data types: QASYMM8
- * @param[in]  input2_stride_x                       Stride of the source tensor in X dimension
- *                                                   (in bytes)
- * @param[in]  input2_step_x                         input2_stride_x * number of elements along X
- *                                                   processed per workitem(in bytes)
- * @param[in]  input2_stride_y                       Stride of the source tensor in Y dimension
- *                                                   (in bytes)
- * @param[in]  input2_step_y                         input2_stride_y * number of elements along Y
- *                                                   processed per workitem(in bytes)
- * @param[in]  input2_stride_z                       Stride of the source tensor in Z dimension
- *                                                   (in bytes)
- * @param[in]  input2_step_z                         input2_stride_z * number of elements along Z
- *                                                   processed per workitem(in bytes)
- * @param[in]  input2_offset_first_element_in_bytes  The offset of the first element in the source
- *                                                   tensor
- * @param[out] output_ptr                            Pointer to the destination tensor.
- *                                                   Supported data types: QASYMM8
- * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension
- *                                                   (in bytes)
- * @param[in]  output_step_x                         output_stride_x * number of elements along X
- *                                                   processed per workitem(in bytes)
- * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension
- *                                                   (in bytes)
- * @param[in]  output_step_y                         output_stride_y * number of elements along Y
- *                                                   processed per workitem(in bytes)
- * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension
- *                                                   (in bytes)
- * @param[in]  output_step_z                         output_stride_z * number of elements along Z
- *                                                   processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the
- *                                                   destination tensor
- */
-__kernel void binary_logical_op(TENSOR3D_DECLARATION(input1), TENSOR3D_DECLARATION(input2),
-                                TENSOR3D_DECLARATION(output))
-{
-  Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1);
-  Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2);
-  Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-#if OP_CODE == 1 // LOGICAL AND
-  VSTORE(VEC_SIZE)
-  (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr) &&
-               VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr),
-           VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
-   0, (__global DATA_TYPE *)output.ptr);
-
-#elif OP_CODE == 2 // LOGICAL OR
-  VSTORE(VEC_SIZE)
-  (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr) ||
-               VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr),
-           VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
-   0, (__global DATA_TYPE *)output.ptr);
-
-#else // OP NOT SUPPORTED
-  // Unknown operation code: leave the output untouched. The terminating semicolon is required,
-  // otherwise this branch does not compile.
-  return;
-
-#endif
-}
-#endif // if defined(OP_CODE) && defined(DATA_TYPE)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
deleted file mode 100644
index 3b0a175a4..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** Up-scaling depth conversion for boolean input: keeps bit 0 of every input element and stores
- * it converted to the output data type.
- *
- * @note The input and output data types must be passed at compile time using -DDATA_TYPE_IN and
- * -DDATA_TYPE_OUT, e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
- * @note The vector size must be passed at compile time using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
- *
- * @param[in]  in_ptr                            Pointer to the source image. Supported data
- *                                               types: U8
- * @param[in]  in_stride_x                       Stride of the source image in X dimension (in
- *                                               bytes)
- * @param[in]  in_step_x                         in_stride_x * number of elements along X
- *                                               processed per workitem (in bytes)
- * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in
- *                                               bytes)
- * @param[in]  in_step_y                         in_stride_y * number of elements along Y
- *                                               processed per workitem (in bytes)
- * @param[in]  in_stride_z                       Stride of the source tensor in Z dimension (in
- *                                               bytes)
- * @param[in]  in_step_z                         in_stride_z * number of elements along Z
- *                                               processed per workitem (in bytes)
- * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source
- *                                               image
- * @param[out] out_ptr                           Pointer to the destination image. Supported data
- *                                               types: U8/S8/U16/S16/U32/S32/F16/F32
- * @param[in]  out_stride_x                      Stride of the destination image in X dimension
- *                                               (in bytes)
- * @param[in]  out_step_x                        out_stride_x * number of elements along X
- *                                               processed per workitem (in bytes)
- * @param[in]  out_stride_y                      Stride of the destination image in Y dimension
- *                                               (in bytes)
- * @param[in]  out_step_y                        out_stride_y * number of elements along Y
- *                                               processed per workitem (in bytes)
- * @param[in]  out_stride_z                      Stride of the destination tensor in Z dimension
- *                                               (in bytes)
- * @param[in]  out_step_z                        out_stride_z * number of elements along Z
- *                                               processed per workitem (in bytes)
- * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the
- *                                               destination image
- */
-__kernel void cast_bool(TENSOR3D_DECLARATION(in), TENSOR3D_DECLARATION(out))
-{
-  // Resolve the per-workitem source and destination pointers
-  Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(in);
-  Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(out);
-
-  // Fetch VEC_SIZE input elements
-  VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
-  raw = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)src.ptr);
-
-  // Only the least significant bit carries the boolean; mask it, then convert to the output type
-  VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
-  widened = CONVERT(raw & 1, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
-
-  VSTORE(VEC_SIZE)
-  (widened, 0, (__global DATA_TYPE_OUT *)dst.ptr);
-}
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl
deleted file mode 100644
index 92e5dfbee..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-
-#ifndef VEC_SIZE
-#define VEC_SIZE 1
-#endif
-
-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
-/** Embedding lookup: copies elements of the input tensor to the output tensor, replacing the
- * coordinate along the outermost input dimension (selected by NUM_DIMS) with the value read from
- * the lookups vector.
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
- *       -DDATA_TYPE=short
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
- *            -DVEC_SIZE=16. It defaults to 1 when not provided.
- * @attention Output tensor depth should be given as a preprocessor argument using
- *            -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16
- * @attention Number of input dimensions is passed as a preprocessor argument using
- *            -DNUM_DIMS=size, e.g. -DNUM_DIMS=4
- *
- * @param[in]  input_ptr                             Pointer to the source tensor. Supported data
- *                                                   types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in
- *                                                   bytes)
- * @param[in]  input_step_x                          input_stride_x * number of elements along X
- *                                                   processed per workitem(in bytes)
- * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in
- *                                                   bytes)
- * @param[in]  input_step_y                          input_stride_y * number of elements along Y
- *                                                   processed per workitem(in bytes)
- * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in
- *                                                   bytes)
- * @param[in]  input_step_z                          input_stride_z * number of elements along Z
- *                                                   processed per workitem(in bytes)
- * @param[in]  input_stride_w                        Stride of the source tensor in W dimension (in
- *                                                   bytes)
- * @param[in]  input_step_w                          input_stride_w * number of elements along W
- *                                                   processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source
- *                                                   tensor
- * @param[out] output_ptr                            Pointer to the destination tensor. Supported
- *                                                   data types: same as @p input_ptr
- * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension
- *                                                   (in bytes)
- * @param[in]  output_step_x                         output_stride_x * number of elements along X
- *                                                   processed per workitem(in bytes)
- * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension
- *                                                   (in bytes)
- * @param[in]  output_step_y                         output_stride_y * number of elements along Y
- *                                                   processed per workitem(in bytes)
- * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension
- *                                                   (in bytes)
- * @param[in]  output_step_z                         output_stride_z * number of elements along Z
- *                                                   processed per workitem(in bytes)
- * @param[in]  output_stride_w                       Stride of the destination tensor in W dimension
- *                                                   (in bytes)
- * @param[in]  output_step_w                         output_stride_w * number of elements along W
- *                                                   processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the
- *                                                   destination tensor
- * @param[in]  lookups_ptr                           Pointer to the lookups vector. Supported data
- *                                                   types: S32
- * @param[in]  lookups_stride_x                      Stride of the lookups vector in X dimension (in
- *                                                   bytes)
- * @param[in]  lookups_step_x                        lookups_stride_x * number of elements along X
- *                                                   processed per workitem(in bytes)
- * @param[in]  lookups_offset_first_element_in_bytes The offset of the first element in the lookups
- *                                                   vector
- */
-
-__kernel void embedding_lookup(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output),
-                               VECTOR_DECLARATION(lookups))
-{
-  Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
-  Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT);
-
-  Vector table = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups);
-
-  // Source coordinate per dimension: the dimension matching NUM_DIMS is redirected through the
-  // lookups vector, every other one follows the work-item id. Global id 2 carries both Z
-  // (id % DEPTH_OUT) and W (id / DEPTH_OUT).
-  const int idx_x = (NUM_DIMS == 1) ? *((__global int *)vector_offset(&table, get_global_id(0)))
-                                    : get_global_id(0);
-  const int idx_y = (NUM_DIMS == 2) ? *((__global int *)vector_offset(&table, get_global_id(1)))
-                                    : get_global_id(1);
-  const int idx_z = (NUM_DIMS == 3) ? *((__global int *)vector_offset(&table, get_global_id(2)))
-                                    : get_global_id(2) % DEPTH_OUT;
-  const int idx_w =
-      (NUM_DIMS == 4) ? *((__global int *)vector_offset(&table, get_global_id(2) / DEPTH_OUT))
-                      : get_global_id(2) / DEPTH_OUT;
-
-  // NOTE(review): input_offset_first_element_in_bytes is added explicitly here; confirm that
-  // CONVERT_TO_TENSOR4D_STRUCT_NO_STEP (helpers.h) does not already fold it into src.ptr.
-  src.ptr += input_offset_first_element_in_bytes + idx_x * input_step_x + idx_y * input_step_y +
-             idx_z * input_step_z + idx_w * input_step_w;
-
-  // Copy VEC_SIZE elements from the looked-up source position to this work-item's output slot
-  VSTORE(VEC_SIZE)
-  (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
-   0, (__global DATA_TYPE *)dst.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gather_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/gather_ex.cl
deleted file mode 100644
index 2236021f1..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/gather_ex.cl
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(AXIS) && defined(INDICES_DIM)
-
-/** Performs the Gather operation along the chosen axis
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
- *       -DDATA_TYPE=short
- * @note Axis should be given as a preprocessor argument using -DAXIS=axis. e.g. -DAXIS=1
- * @attention Output tensor depth should be given as a preprocessor argument using
- *            -DOUTPUT_DIM_Z=size. e.g. -DOUTPUT_DIM_Z=16
- * @attention Input tensor depth should be given as a preprocessor argument using
- *            -DINPUT_DIM_Z=size. e.g. -DINPUT_DIM_Z=16
- *
- * @param[in]  input_ptr                             Pointer to the source tensor. Supported data
- *                                                   types: U8/S8/U16/S16/U32/S32/F16/F32
- * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in
- *                                                   bytes)
- * @param[in]  input_step_x                          input_stride_x * number of elements along X
- *                                                   processed per work item (in bytes)
- * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in
- *                                                   bytes)
- * @param[in]  input_step_y                          input_stride_y * number of elements along Y
- *                                                   processed per work item (in bytes)
- * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in
- *                                                   bytes)
- * @param[in]  input_step_z                          input_stride_z * number of elements along Z
- *                                                   processed per work item (in bytes)
- * @param[in]  input_stride_w                        Stride of the source tensor in W dimension (in
- *                                                   bytes)
- * @param[in]  input_step_w                          input_stride_w * number of elements along W
- *                                                   processed per work item (in bytes)
- * @param[in]  input_offset_first_element_in_bytes   Offset of the first element in the source
- *                                                   tensor
- * @param[in]  indices_ptr                           Pointer to the indices tensor. Supported data
- *                                                   types: S32
- * @param[in]  indices_stride_x                      Stride of the indices tensor in X dimension (in
- *                                                   bytes)
- * @param[in]  indices_step_x                        indices_stride_x * number of elements along X
- *                                                   processed per work item (in bytes)
- * @param[in]  indices_stride_y                      Stride of the indices tensor in Y dimension (in
- *                                                   bytes)
- * @param[in]  indices_step_y                        indices_stride_y * number of elements along Y
- *                                                   processed per work item (in bytes)
- * @param[in]  indices_stride_z                      Stride of the indices tensor in Z dimension (in
- *                                                   bytes)
- * @param[in]  indices_step_z                        indices_stride_z * number of elements along Z
- *                                                   processed per work item (in bytes)
- * @param[in]  indices_offset_first_element_in_bytes The offset of the first element in the
- *                                                   indices tensor
- * @param[out] output_ptr                            Pointer to the destination tensor. Supported
- *                                                   data types: same as @p input_ptr
- * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension
- *                                                   (in bytes)
- * @param[in]  output_step_x                         output_stride_x * number of elements along X
- *                                                   processed per work item (in bytes)
- * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension
- *                                                   (in bytes)
- * @param[in]  output_step_y                         output_stride_y * number of elements along Y
- *                                                   processed per work item (in bytes)
- * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension
- *                                                   (in bytes)
- * @param[in]  output_step_z                         output_stride_z * number of elements along Z
- *                                                   processed per work item (in bytes)
- * @param[in]  output_stride_w                       Stride of the destination tensor in W dimension
- *                                                   (in bytes)
- * @param[in]  output_step_w                         output_stride_w * number of elements along W
- *                                                   processed per work item (in bytes)
- * @param[in]  output_offset_first_element_in_bytes  Offset of the first element in the destination
- *                                                   tensor
- */
-__kernel void gather_ex(TENSOR4D_DECLARATION(input), TENSOR3D_DECLARATION(indices),
-                        TENSOR4D_DECLARATION(output))
-{
-  const int px = get_global_id(0);
-  const int py = get_global_id(1);
-  const int pz = get_global_id(2) % OUTPUT_DIM_Z;
-  const int pw = get_global_id(2) / OUTPUT_DIM_Z;
-  // Global id 2 packs two coordinates; OUTPUT_DIM_Z splits it into pz and pw above.
-  const Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, INPUT_DIM_Z);
-  const Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(indices);
-  Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, OUTPUT_DIM_Z);
-  // The value read from indices replaces the input coordinate selected by AXIS.
-#if AXIS == 0 // NOTE(review): indices are documented as S32 but are read as uint below
-#if INDICES_DIM == 1
-  const uint index = *(__global const uint *)tensor3D_offset(&indices, px, 0, 0);
-  __global const uchar *input_addr = tensor4D_offset(&input, index, py, pz, pw);
-#elif INDICES_DIM == 2
-  const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, 0);
-  __global const uchar *input_addr = tensor4D_offset(&input, index, pz, pw, 0);
-#elif INDICES_DIM == 3
-  const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, pz);
-  __global const uchar *input_addr = tensor4D_offset(&input, index, pw, 0, 0);
-#endif
-#elif AXIS == 1
-#if INDICES_DIM == 1
-  const uint index = *(__global const uint *)tensor3D_offset(&indices, py, 0, 0);
-  __global const uchar *input_addr = tensor4D_offset(&input, px, index, pz, pw);
-#elif INDICES_DIM == 2
-  const uint index = *(__global const uint *)tensor3D_offset(&indices, py, pz, 0);
-  __global const uchar *input_addr = tensor4D_offset(&input, px, index, pw, 0);
-#elif INDICES_DIM == 3
-  const uint index = *(__global const uint *)tensor3D_offset(&indices, py, pz, pw);
-  __global const uchar *input_addr = tensor4D_offset(&input, px, index, 0, 0);
-#endif
-#elif AXIS == 2
-#if INDICES_DIM == 1
-  const uint index = *(__global const uint *)tensor3D_offset(&indices, pz, 0, 0);
-  __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, pw);
-#elif INDICES_DIM == 2
-  const uint index = *(__global const uint *)tensor3D_offset(&indices, pz, pw, 0);
-  __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, 0);
-#endif
-#elif AXIS == 3
-#if INDICES_DIM == 1
-  const uint index = *(__global const uint *)tensor3D_offset(&indices, pw, 0, 0);
-  __global const uchar *input_addr = tensor4D_offset(&input, px, py, pz, index);
-#endif
-#endif // AXIS; unhandled AXIS/INDICES_DIM pairs leave input_addr undeclared
-  // Copy a single DATA_TYPE element from the selected input address to output.ptr.
-  *(__global DATA_TYPE *)output.ptr = *((__global const DATA_TYPE *)input_addr);
-}
-
-#endif // defined(DATA_TYPE) && defined(AXIS) && defined(INDICES_DIM)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl
deleted file mode 100644
index 80ba73d1d..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl
+++ /dev/null
@@ -1,354 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-
-#if defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && \
-    defined(COLS_A)
-#define VECTOR_CHAR VEC_DATA_TYPE(char, NUM_ELEMS_PROCESSED_PER_THREAD_X)
-#define VECTOR_INT VEC_DATA_TYPE(int, NUM_ELEMS_PROCESSED_PER_THREAD_X)
-#define VECTOR_FLOAT VEC_DATA_TYPE(float, NUM_ELEMS_PROCESSED_PER_THREAD_X)
-/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B
- * (src1) in case both matrices have not been reshaped
- *
- * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
- *
- * @note In case the input or output have to be reinterpreted as a 3D tensor, the following
- * information must be passed at compile time:
- *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
- *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
- * tensor.
- *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
- *
- * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type:
- * QASYMM8
- * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in
- * bytes)
- * @param[in]  src0_step_x                        src0_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in
- * bytes)
- * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source
- * matrix
- * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data type:
- * same as @p src0_ptr
- * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in
- * bytes)
- * @param[in]  src1_step_x                        src1_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in
- * bytes)
- * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source
- * matrix
- * @param[out] dst_ptr                            Pointer to the destination matrix Supported data
- * type: S32
- * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension
- * (in bytes)
- * @param[in]  dst_step_x                         dst_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension
- * (in bytes)
- * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination
- * matrix
- * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in
- * bytes)
- * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in
- * bytes)
- * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension
- * (in bytes)
- * @param[in]  src_cross_plane_pad                (Optional) Bottom paddings in unit of elements for
- * the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
- * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings in unit of elements for
- * the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)
- */
-__kernel void gemmlowp_mm_midgard_ex(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1),
-                                     IMAGE_DECLARATION(dst), uint src0_stride_z, uint src1_stride_z,
-                                     uint dst_stride_z
-#if defined(REINTERPRET_INPUT_AS_3D)
-                                     ,
-                                     uint src_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-                                     ,
-                                     uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
-                                     )
-{
-  int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
-  // idx is added to the matrix B address below as this work-item's offset along X.
-  // Compute starting address for matrix A and Matrix B
-  int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
-
-  // Update address for the matrix A
-  src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
-
-  // Update address for the matrix B
-  src_addr.s1 += idx;
-
-#if defined(REINTERPRET_INPUT_AS_3D)
-  // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across
-  // the z dimension
-  // in order to take into account the presence of possible cross plane paddings
-  //
-  //  |                  |
-  //  |      plane0      |
-  //  |                  |
-  //  |__________________|
-  //  |******************|
-  //  |  cross_plane_pad |
-  //  |******************|
-  //  |                  |
-  //  |      plane1      |
-  //  |                  |
-  //  |__________________|
-
-  // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)
-  // by HEIGHT_GEMM3D
-  uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) /
-              (uint4)HEIGHT_GEMM3D;
-  zin = min(DEPTH_GEMM3D - 1, zin);
-
-  // Add offset due to the cross plane paddings
-  zin *= (src_cross_plane_pad * src0_stride_y);
-  // NOTE(review): zin is never read after this point; the matrix A loads below do not add it.
-  // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
-  // multiply src0_stride_z by DEPTH_GEMM3D
-  src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
-  // Add offset for batched GEMM
-  src_addr.s0 += get_global_id(2) * src0_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
-#if defined(MATRIX_B_DEPTH)
-  // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
-  src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
-#else  // defined(MATRIX_B_DEPTH)
-  src_addr.s1 += get_global_id(2) * src1_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-  // Exclusive end offset of the first matrix A row; both loops below stop against it.
-  int end_row_vec_a = src_addr.s0 + COLS_A;
-  // One int accumulator vector per output row handled by this work-item (up to five).
-  VECTOR_INT acc0 = 0;
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-  VECTOR_INT acc1 = 0;
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-  VECTOR_INT acc2 = 0;
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-  VECTOR_INT acc3 = 0;
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
-  VECTOR_INT acc4 = 0;
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
-  // Main loop: each iteration consumes two columns of A and the two matching rows of B.
-  for (; src_addr.s0 <= (end_row_vec_a - 2); src_addr += (int2)(2, 2 * src1_stride_y))
-  {
-    // Load values from matrix A
-    char2 a0 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-    char2 a1 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-    char2 a2 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-    char2 a3 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
-    char2 a4 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 4 * src0_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
-    // Load values from matrix B
-    VECTOR_CHAR b0 =
-        VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global char *)(src1_ptr + src_addr.s1));
-    VECTOR_CHAR b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(
-        0, (__global char *)(src1_ptr + src_addr.s1 + src1_stride_y));
-
-    // Accumulate
-    acc0 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a0.s0;
-    acc0 += CONVERT(b1, VECTOR_INT) * (VECTOR_INT)a0.s1;
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-    acc1 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a1.s0;
-    acc1 += CONVERT(b1, VECTOR_INT) * (VECTOR_INT)a1.s1;
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-    acc2 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a2.s0;
-    acc2 += CONVERT(b1, VECTOR_INT) * (VECTOR_INT)a2.s1;
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-    acc3 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a3.s0;
-    acc3 += CONVERT(b1, VECTOR_INT) * (VECTOR_INT)a3.s1;
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
-    acc4 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a4.s0;
-    acc4 += CONVERT(b1, VECTOR_INT) * (VECTOR_INT)a4.s1;
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
-  }
-  // Tail loop: consumes the single remaining column of A when COLS_A is odd.
-  for (; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
-  {
-    // Load values from matrix A
-    char a0 = *(__global char *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y);
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-    char a1 = *(__global char *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-    char a2 = *(__global char *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-    char a3 = *(__global char *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
-    char a4 = *(__global char *)(src0_ptr + src_addr.s0 + 4 * src0_stride_y);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
-    // Load values from matrix B
-    VECTOR_CHAR b0 =
-        VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global char *)(src1_ptr + src_addr.s1));
-
-    // Accumulate
-    acc0 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a0;
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-    acc1 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a1;
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-    acc2 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a2;
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-    acc3 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a3;
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
-    acc4 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a4;
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
-  }
-
-  const int z = get_global_id(2);
-
-  // Compute destination address
-  Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-  // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across
-  // the z dimension
-  // in order to take into account the presence of possible cross plane paddings
-  //
-  //  |                  |
-  //  |      plane0      |
-  //  |                  |
-  //  |__________________|
-  //  |******************|
-  //  |  cross_plane_pad |
-  //  |******************|
-  //  |                  |
-  //  |      plane1      |
-  //  |                  |
-  //  |__________________|
-
-  // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)
-  // by HEIGHT_GEMM3D
-  uint8 zout = ((uint8)(0, 1, 2, 3, 4, 5, 6, 7) +
-                (uint8)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) /
-               (uint8)HEIGHT_GEMM3D;
-  zout = min(DEPTH_GEMM3D - 1, zout);
-
-  // Add offset due to the cross plane paddings
-  zout *= (dst_cross_plane_pad * dst_stride_y);
-
-  // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
-  // multiply dst_stride_z by DEPTH_GEMM3D
-  dst.ptr += z * dst_stride_z * DEPTH_GEMM3D;
-
-  // Store the result
-  VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
-  (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-  VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
-  (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-  VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
-  (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-  VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
-  (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
-  VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
-  (CONVERT(acc4, VECTOR_INT), 0, (__global int *)(dst.ptr + 4 * dst_stride_y + zout.s4));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-  // Add offset for batched GEMM
-  dst.ptr += z * dst_stride_z;
-
-  // Store the result
-  VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
-  (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(dst.ptr + 0 * dst_stride_y));
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-  VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
-  (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(dst.ptr + 1 * dst_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-  VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
-  (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(dst.ptr + 2 * dst_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-  VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
-  (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(dst.ptr + 3 * dst_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
-  VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
-  (CONVERT(acc4, VECTOR_INT), 0, (__global int *)(dst.ptr + 4 * dst_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-}
-#endif // defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) &&
-       // defined(COLS_A)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl
deleted file mode 100644
index a4f7dbd48..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-
-#ifndef VEC_SIZE
-#define VEC_SIZE 1
-#endif
-
-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
-/** Perform hashtable_lookup of input tensor
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
- *       -DDATA_TYPE=short
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
- *            -DVEC_SIZE=16
- * @attention Output tensor depth should be given as a preprocessor argument using
- *            -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16
- * @attention Number of input dimensions are passed as a preprocessor argument using
- *            -DNUM_DIMS=size, e.g. -DNUM_DIMS=4
- *
- * @param[in]  input_ptr                             Pointer to the source tensor. Supported data
- *                                                   types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in
- *                                                   bytes)
- * @param[in]  input_step_x                          input_stride_x * number of elements along X
- *                                                   processed per workitem(in bytes)
- * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in
- *                                                   bytes)
- * @param[in]  input_step_y                          input_stride_y * number of elements along Y
- *                                                   processed per workitem(in bytes)
- * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in
- *                                                   bytes)
- * @param[in]  input_step_z                          input_stride_z * number of elements along Z
- *                                                   processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source
- *                                                   tensor
- * @param[in]  input_stride_w                        Stride of the source tensor in W dimension (in
- *                                                   bytes)
- * @param[in]  input_step_w                          input_stride_w * number of elements along W
- *                                                   processed per workitem(in bytes)
- * @param[out] output_ptr                            Pointer to the destination tensor. Supported
- *                                                   data types: same as @p input_ptr
- * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension
- *                                                   (in bytes)
- * @param[in]  output_step_x                         output_stride_x * number of elements along X
- *                                                   processed per workitem(in bytes)
- * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension
- *                                                   (in bytes)
- * @param[in]  output_step_y                         output_stride_y * number of elements along Y
- *                                                   processed per workitem(in bytes)
- * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension
- *                                                   (in bytes)
- * @param[in]  output_step_z                         output_stride_z * number of elements along Z
- *                                                   processed per workitem(in bytes)
- * @param[in]  output_stride_w                       Stride of the destination tensor in W dimension
- *                                                   (in bytes)
- * @param[in]  output_step_w                         output_stride_w * number of elements along W
- *                                                   processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the
- *                                                   destination tensor
- * @param[in]  lookups_ptr                           Pointer to the lookups vector. Supported data
- *                                                   types: S32
- * @param[in]  lookups_stride_x                      Stride of the lookups vector in X dimension (in
- *                                                   bytes)
- * @param[in]  lookups_step_x                        lookups_stride_x * number of elements along X
- *                                                   processed per workitem(in bytes)
- * @param[in]  lookups_offset_first_element_in_bytes The offset of the first element in the lookups
- *                                                   vector
- */
-__kernel void hashtable_lookup(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output),
-                               VECTOR_DECLARATION(lookups))
-{
-  Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
-  Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT);
-
-  Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups);
-  // Source coordinates: entry NUM_DIMS - 1 is read from lookups, the others come from work-item ids.
-  int lup_id[4] = {0};
-
-  lup_id[0] = (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0)))
-                              : get_global_id(0);
-  lup_id[1] = (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1)))
-                              : get_global_id(1);
-  lup_id[2] = (NUM_DIMS == 3) ? *((__global int *)vector_offset(&lups, get_global_id(2)))
-                              : get_global_id(2) % DEPTH_OUT;
-  lup_id[3] = (NUM_DIMS == 4)
-                  ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT))
-                  : get_global_id(2) / DEPTH_OUT;
-  // A negative looked-up id writes zeros to the output and skips the copy.
-  if (lup_id[NUM_DIMS - 1] < 0)
-  {
-    VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, 0, (__global DATA_TYPE *)out.ptr);
-    return;
-  }
-  // NOTE(review): check that the NO_STEP macro did not already add the first-element offset.
-  in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x +
-            lup_id[1] * input_step_y + lup_id[2] * input_step_z + lup_id[3] * input_step_w;
-  // Load VEC_SIZE elements of DATA_TYPE from in.ptr and store them at out.ptr.
-  VSTORE(VEC_SIZE)
-  (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0,
-   (__global DATA_TYPE *)out.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
deleted file mode 100644
index e07a25ec9..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
+++ /dev/null
@@ -1,571 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_HELPER_H
-#define ARM_COMPUTE_HELPER_H
-
-#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
-
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
-#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
-#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
-
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && \
-    defined(cl_arm_integer_dot_product_accumulate_int8)
-#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
-#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) &&
-       // defined(cl_arm_integer_dot_product_accumulate_int8)
-
-#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
-#pragma OPENCL EXTENSION cl_arm_printf : enable
-#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
-
-/* Numeric tags for two GPU architecture families. Nothing in this header reads them;
- * presumably they are compared against a build-time define by kernels - TODO confirm. */
-#define GPU_ARCH_MIDGARD 0x100
-#define GPU_ARCH_BIFROST 0x200
-
-/** Concatenate two inputs.
- *
- * The arguments are pasted directly with ##, so an argument that is itself a macro is pasted by
- * name and is NOT expanded first.
- *
- * @param[in] a The first input to be concatenated
- * @param[in] b The second input to be concatenated
- *
- * @return The concatenated output
- */
-#define CONCAT(a, b) a##b
-
-/** Identity macro: yields its argument unchanged (after the usual macro expansion of x).
- *
- * @param[in] x The tokens to be expanded
- *
- * @return The expanded output
- */
-#define EXPAND(x) x
-
-/** Clamp the given value between an upper and lower bound.
- *
- * Implemented as min(max(x, min_val), max_val); no check is made that min_val <= max_val.
- *
- * @param[in] x       The value to be clamped
- * @param[in] min_val The lower bound
- * @param[in] max_val The upper bound
- *
- * @return The clamped value.
- */
-#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
-
-/** REVn reverses the given vector whose size is n.
- * @name REVn
- *
- * Each variant is a component swizzle listing the lanes in descending order; REV1 returns the
- * scalar unchanged.
- *
- * @param[in] x The vector to be reversed
- *
- * @return The reversed vector
- * @{
- */
-#define REV1(x) ((x))
-#define REV2(x) ((x).s10)
-#define REV3(x) ((x).s210)
-#define REV4(x) ((x).s3210)
-#define REV8(x) ((x).s76543210)
-#define REV16(x) ((x).sFEDCBA9876543210)
-/** @} */ // end of group REVn
-
-/** Reverse the given vector.
- * @name REVERSE
- *
- * REVERSE forwards to REVERSE_STR so that a macro passed as s (e.g. a vector-size define) is
- * expanded before it is pasted onto REV. Only the sizes that have a REVn macro are supported:
- * 1, 2, 3, 4, 8 and 16.
- *
- * @param[in] x The vector to be reversed
- * @param[in] s The size of the vector
- *
- * @return The reversed vector
- * @{
- */
-#define REVERSE_STR(x, s) REV##s((x))
-#define REVERSE(x, s) REVERSE_STR(x, s)
-/** @} */ // end of group REVERSE
-
-/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
- * @name ROTs_n
- *
- * Lane i of the result is lane (i - n) modulo s of x, e.g. ROT4_1 maps (a, b, c, d) to
- * (d, a, b, c). ROTs_0 returns x unchanged.
- *
- * @param[in] x The vector to be shifted
- *
- * @return The shifted vector
- * @{
- */
-#define ROT1_0(x) ((x))
-
-#define ROT2_0(x) ((x))
-#define ROT2_1(x) ((x).s10)
-
-#define ROT3_0(x) ((x))
-#define ROT3_1(x) ((x).s201)
-#define ROT3_2(x) ((x).s120)
-
-#define ROT4_0(x) ((x))
-#define ROT4_1(x) ((x).s3012)
-#define ROT4_2(x) ((x).s2301)
-#define ROT4_3(x) ((x).s1230)
-
-#define ROT8_0(x) ((x))
-#define ROT8_1(x) ((x).s70123456)
-#define ROT8_2(x) ((x).s67012345)
-#define ROT8_3(x) ((x).s56701234)
-#define ROT8_4(x) ((x).s45670123)
-#define ROT8_5(x) ((x).s34567012)
-#define ROT8_6(x) ((x).s23456701)
-#define ROT8_7(x) ((x).s12345670)
-
-#define ROT16_0(x) ((x))
-#define ROT16_1(x) ((x).sF0123456789ABCDE)
-#define ROT16_2(x) ((x).sEF0123456789ABCD)
-#define ROT16_3(x) ((x).sDEF0123456789ABC)
-#define ROT16_4(x) ((x).sCDEF0123456789AB)
-#define ROT16_5(x) ((x).sBCDEF0123456789A)
-#define ROT16_6(x) ((x).sABCDEF0123456789)
-#define ROT16_7(x) ((x).s9ABCDEF012345678)
-#define ROT16_8(x) ((x).s89ABCDEF01234567)
-#define ROT16_9(x) ((x).s789ABCDEF0123456)
-#define ROT16_10(x) ((x).s6789ABCDEF012345)
-#define ROT16_11(x) ((x).s56789ABCDEF01234)
-#define ROT16_12(x) ((x).s456789ABCDEF0123)
-#define ROT16_13(x) ((x).s3456789ABCDEF012)
-#define ROT16_14(x) ((x).s23456789ABCDEF01)
-#define ROT16_15(x) ((x).s123456789ABCDEF0)
-/** @} */ // end of group ROTs_n
-
-/** Circular-right-shift (rotate-right) the given vector by the given amount.
- * @name ROTATE
- *
- * ROTATE forwards to ROTATE_STR so that macros passed as s or n are expanded before they are
- * pasted into the ROTs_n name. Only the (s, n) pairs defined above are available: s in
- * {1, 2, 3, 4, 8, 16} and 0 <= n < s.
- *
- * @param[in] x The vector to be shifted
- * @param[in] s The size of the vector
- * @param[in] n The amount to be shifted
- *
- * @return The shifted vector
- * @{
- */
-#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
-#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
-/** @} */ // end of group ROTATE
-
-/** Creates a vector of size n filled with offset values corresponding to the location of each
- * element.
- * @name V_OFFSn
- *
- * Lane i of the result holds the value i, i.e. the vector is (0, 1, ..., n - 1).
- *
- * @param[in] dt The data type of the output vector
- *
- * @return The vector filled with offset values
- * @{
- */
-#define V_OFFS1(dt) (dt)(0)
-#define V_OFFS2(dt) (dt)(0, 1)
-/* Lanes are 0, 1, 2: like every other size, each lane must hold its own index. */
-#define V_OFFS3(dt) (dt)(0, 1, 2)
-#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
-#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
-#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
-/** @} */ // end of group V_OFFSn
-
-/** Create a vector filled with offset values corresponding to the location of each element.
- * @name VEC_OFFS
- *
- * VEC_OFFS forwards to VEC_OFFS_STR so that a macro passed as s is expanded before it is pasted
- * onto V_OFFS. Supported sizes are 1, 2, 3, 4, 8 and 16.
- *
- * @param[in] dt The data type of the output vector
- * @param[in] s  The size of the output vector
- *
- * @return The vector filled with offset values
- * @{
- */
-#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
-#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
-/** @} */ // end of group VEC_OFFS
-
-/* VLOAD(size) / VSTORE(size) expand to the name vload<size> / vstore<size>. The *_STR level
- * exists so that a macro passed as size is expanded before it is pasted onto the name. */
-#define VLOAD_STR(size) vload##size
-#define VLOAD(size) VLOAD_STR(size)
-
-#define VSTORE_STR(size) vstore##size
-#define VSTORE(size) VSTORE_STR(size)
-
-/* Size-1 aliases: let type##1 names produced by token pasting (e.g. VEC_DATA_TYPE(type, 1))
- * resolve to the plain scalar type. */
-#define float1 float
-#define half1 half
-#define char1 char
-#define uchar1 uchar
-#define short1 short
-#define ushort1 ushort
-#define int1 int
-#define uint1 uint
-#define long1 long
-#define ulong1 ulong
-#define double1 double
-
-/* Scalar counterparts of vloadn / vstoren, reached through VLOAD(1) / VSTORE(1).
- *
- * OFFSET is counted in elements of the pointed-to type. Every argument is parenthesised so that
- * expression arguments (for example a conditional offset) keep their meaning after substitution,
- * and each expansion is wrapped in parentheses so it can be used safely as an operand.
- */
-#define vload1(OFFSET, PTR) (*((PTR) + (OFFSET)))
-#define vstore1(DATA, OFFSET, PTR) (*((PTR) + (OFFSET)) = (DATA))
-
-// The convert built-ins do not support the _sat modifier for floating-point destination types,
-// so the _sat spellings of the floating-point conversions are mapped onto the plain conversions.
-#define convert_float_sat convert_float
-#define convert_float1_sat convert_float
-#define convert_float2_sat convert_float2
-#define convert_float3_sat convert_float3
-#define convert_float4_sat convert_float4
-#define convert_float8_sat convert_float8
-#define convert_float16_sat convert_float16
-// The half conversions map to convert_half*, so the result keeps the requested type.
-#define convert_half_sat convert_half
-#define convert_half1_sat convert_half
-#define convert_half2_sat convert_half2
-#define convert_half3_sat convert_half3
-#define convert_half4_sat convert_half4
-#define convert_half8_sat convert_half8
-#define convert_half16_sat convert_half16
-
-// Size-1 conversions resolve to the scalar conversion built-ins.
-#define convert_float1 convert_float
-#define convert_half1 convert_half
-#define convert_char1 convert_char
-#define convert_uchar1 convert_uchar
-#define convert_short1 convert_short
-#define convert_ushort1 convert_ushort
-#define convert_int1 convert_int
-#define convert_uint1 convert_uint
-#define convert_long1 convert_long
-#define convert_ulong1 convert_ulong
-#define convert_double1 convert_double
-
-// Size-1 saturating conversions resolve to the scalar saturating built-ins.
-#define convert_char1_sat convert_char_sat
-#define convert_uchar1_sat convert_uchar_sat
-#define convert_short1_sat convert_short_sat
-#define convert_ushort1_sat convert_ushort_sat
-#define convert_int1_sat convert_int_sat
-#define convert_uint1_sat convert_uint_sat
-#define convert_long1_sat convert_long_sat
-#define convert_ulong1_sat convert_ulong_sat
-// double is a floating-point destination: there is no saturating form, use the plain conversion.
-#define convert_double1_sat convert_double
-
-/* In every pair below the public macro forwards to its *_STR twin so that macro arguments are
- * expanded before they are pasted into a type or function name. */
-
-/* VEC_DATA_TYPE(type, size) -> type<size>, e.g. a type of float and size of 4 give float4. */
-#define VEC_DATA_TYPE_STR(type, size) type##size
-#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
-
-/* Same expansion as VEC_DATA_TYPE, under a second name. */
-#define CL_VEC_DATA_TYPE_STR(type, size) type##size
-#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
-
-/* CONVERT(x, type) -> convert_<type>(x) */
-#define CONVERT_STR(x, type) (convert_##type((x)))
-#define CONVERT(x, type) CONVERT_STR(x, type)
-
-/* CONVERT_SAT(x, type) -> convert_<type>_sat(x) */
-#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
-#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
-
-/* CONVERT_SAT_ROUND(x, type, round) -> convert_<type>_sat_<round>(x) */
-#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
-#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
-
-/* Kernel parameter lists. Each macro expands to the parameters describing one buffer, all
- * prefixed with 'name': the __global uchar base pointer name_ptr, a stride/step pair per
- * dimension, and name_offset_first_element_in_bytes. The CONVERT_TO_*_STRUCT macros consume
- * exactly these names. */
-
-/* One dimension (x). */
-#define VECTOR_DECLARATION(name)                                        \
-  __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, \
-      uint name##_offset_first_element_in_bytes
-
-/* Two dimensions (x, y). */
-#define IMAGE_DECLARATION(name)                                                               \
-  __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \
-      uint name##_step_y, uint name##_offset_first_element_in_bytes
-
-/* Three dimensions (x, y, z). */
-#define TENSOR3D_DECLARATION(name)                                                            \
-  __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \
-      uint name##_step_y, uint name##_stride_z, uint name##_step_z,                           \
-      uint name##_offset_first_element_in_bytes
-
-/* Four dimensions (x, y, z, w). */
-#define TENSOR4D_DECLARATION(name)                                                            \
-  __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \
-      uint name##_step_y, uint name##_stride_z, uint name##_step_z, uint name##_stride_w,     \
-      uint name##_step_w, uint name##_offset_first_element_in_bytes
-
-/* Struct builders. Each macro forwards the kernel parameters generated by the matching
- * *_DECLARATION(name) macro to an update_*_workitem_ptr helper, which returns a descriptor whose
- * pointer is positioned for the calling work-item. The *_NO_STEP variants pass 0 for the steps,
- * so the pointer is only moved past offset_first_element_in_bytes and is not advanced by the
- * work-item id. */
-
-#define CONVERT_TO_VECTOR_STRUCT(name)                                                          \
-  update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
-                             name##_step_x)
-
-#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
-  update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
-
-#define CONVERT_TO_IMAGE_STRUCT(name)                                                          \
-  update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
-                            name##_step_x, name##_stride_y, name##_step_y)
-
-#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name)                                                     \
-  update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \
-                            name##_stride_y, 0)
-
-/* Image view of a 3D tensor: the descriptor is 2D, the pointer is also advanced along Z. */
-#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name)                                              \
-  update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
-                                          name##_stride_x, name##_step_x, name##_stride_y,  \
-                                          name##_step_y, name##_stride_z, name##_step_z)
-
-/* Unlike the other NO_STEP variants, only the X and Y steps are zeroed; step_z is kept. */
-#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name)                                             \
-  update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes,        \
-                                          name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, \
-                                          name##_step_z)
-
-#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                          \
-  update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
-                               name##_step_x, name##_stride_y, name##_step_y, name##_stride_z,    \
-                               name##_step_z)
-
-#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name)                                                  \
-  update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
-                               0, name##_stride_y, 0, name##_stride_z, 0)
-
-/* mod_size splits get_global_id(2) into a Z index (remainder) and a W index (quotient). */
-#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                \
-  update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
-                               name##_step_x, name##_stride_y, name##_step_y, name##_stride_z,    \
-                               name##_step_z, name##_stride_w, name##_step_w, mod_size)
-
-#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size)                                        \
-  update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
-                               0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0,     \
-                               mod_size)
-
-/** Structure to hold Vector information */
-typedef struct Vector
-{
-  __global uchar *ptr;               /**< Pointer to the starting position of the buffer */
-  int offset_first_element_in_bytes; /**< The offset of the first element in the source vector */
-  int stride_x;                      /**< Stride of the vector in X dimension (in bytes) */
-} Vector;
-
-/** Structure to hold Image information */
-typedef struct Image
-{
-  __global uchar *ptr;               /**< Pointer to the starting position of the buffer */
-  int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
-  int stride_x;                      /**< Stride of the image in X dimension (in bytes) */
-  int stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
-} Image;
-
-/** Structure to hold 3D tensor information */
-typedef struct Tensor3D
-{
-  __global uchar *ptr;               /**< Pointer to the starting position of the buffer */
-  int offset_first_element_in_bytes; /**< The offset of the first element in the source tensor */
-  int stride_x;                      /**< Stride of the tensor in X dimension (in bytes) */
-  int stride_y;                      /**< Stride of the tensor in Y dimension (in bytes) */
-  int stride_z;                      /**< Stride of the tensor in Z dimension (in bytes) */
-} Tensor3D;
-
-/** Structure to hold 4D tensor information */
-typedef struct Tensor4D
-{
-  __global uchar *ptr;               /**< Pointer to the starting position of the buffer */
-  int offset_first_element_in_bytes; /**< The offset of the first element in the source tensor */
-  int stride_x;                      /**< Stride of the tensor in X dimension (in bytes) */
-  int stride_y;                      /**< Stride of the tensor in Y dimension (in bytes) */
-  int stride_z;                      /**< Stride of the tensor in Z dimension (in bytes) */
-  int stride_w;                      /**< Stride of the tensor in W dimension (in bytes) */
-} Tensor4D;
-
-/** Build a Vector descriptor and position its pointer on the calling work-item's data.
- *
- * @param[in] ptr                           Pointer to the starting position of the buffer
- * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
- * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
- * @param[in] step_x                        Bytes to advance per unit of get_global_id(0), i.e.
- *                                          stride_x * number of elements along X processed per
- *                                          workitem
- *
- * @return A Vector whose ptr equals
- *         ptr + offset_first_element_in_bytes + get_global_id(0) * step_x
- */
-inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
-                                         uint stride_x, uint step_x)
-{
-  Vector vec;
-  vec.ptr = ptr;
-  vec.offset_first_element_in_bytes = offset_first_element_in_bytes;
-  vec.stride_x = stride_x;
-
-  // Skip the leading offset, then move to this work-item's slot along X.
-  vec.ptr += vec.offset_first_element_in_bytes + get_global_id(0) * step_x;
-  return vec;
-}
-
-/** Build an Image descriptor and position its pointer on the calling work-item's data.
- *
- * @param[in] ptr                           Pointer to the starting position of the buffer
- * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
- * @param[in] step_x                        Bytes to advance per unit of get_global_id(0), i.e.
- *                                          stride_x * number of elements along X processed per
- *                                          workitem
- * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
- * @param[in] step_y                        Bytes to advance per unit of get_global_id(1), i.e.
- *                                          stride_y * number of elements along Y processed per
- *                                          workitem
- *
- * @return An Image whose ptr has been moved past the first-element offset and advanced by the
- *         work-item's X and Y ids times the respective steps
- */
-inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
-                                       uint stride_x, uint step_x, uint stride_y, uint step_y)
-{
-  Image image;
-  image.ptr = ptr;
-  image.offset_first_element_in_bytes = offset_first_element_in_bytes;
-  image.stride_x = stride_x;
-  image.stride_y = stride_y;
-
-  // Skip the leading offset, then move to this work-item's position along X and Y.
-  image.ptr +=
-      image.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
-  return image;
-}
-
-/** Build an Image descriptor over a 3D tensor and position its pointer on the calling
- * work-item's data.
- *
- * The returned descriptor keeps only the X and Y strides; stride_z is accepted but not stored.
- * The pointer is nevertheless advanced along Z by get_global_id(2) * step_z.
- *
- * @param[in] ptr                           Pointer to the starting position of the buffer
- * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] stride_x                      Stride of the tensor in X dimension (in bytes)
- * @param[in] step_x                        Bytes to advance per unit of get_global_id(0)
- * @param[in] stride_y                      Stride of the tensor in Y dimension (in bytes)
- * @param[in] step_y                        Bytes to advance per unit of get_global_id(1)
- * @param[in] stride_z                      Stride of the tensor in Z dimension (in bytes); unused
- * @param[in] step_z                        Bytes to advance per unit of get_global_id(2)
- *
- * @return An image object
- */
-inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr,
-                                                     uint offset_first_element_in_bytes,
-                                                     uint stride_x, uint step_x, uint stride_y,
-                                                     uint step_y, uint stride_z, uint step_z)
-{
-  Image slice;
-  slice.ptr = ptr;
-  slice.offset_first_element_in_bytes = offset_first_element_in_bytes;
-  slice.stride_x = stride_x;
-  slice.stride_y = stride_y;
-
-  // Skip the leading offset, then move to this work-item's position along X, Y and Z.
-  slice.ptr += slice.offset_first_element_in_bytes + get_global_id(0) * step_x +
-               get_global_id(1) * step_y + get_global_id(2) * step_z;
-  return slice;
-}
-
-/** Build a Tensor3D descriptor and position its pointer on the calling work-item's data.
- *
- * @param[in] ptr                           Pointer to the starting position of the buffer
- * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] stride_x                      Stride of the tensor in X dimension (in bytes)
- * @param[in] step_x                        Bytes to advance per unit of get_global_id(0), i.e.
- *                                          stride_x * number of elements along X processed per
- *                                          workitem
- * @param[in] stride_y                      Stride of the tensor in Y dimension (in bytes)
- * @param[in] step_y                        Bytes to advance per unit of get_global_id(1), i.e.
- *                                          stride_y * number of elements along Y processed per
- *                                          workitem
- * @param[in] stride_z                      Stride of the tensor in Z dimension (in bytes)
- * @param[in] step_z                        Bytes to advance per unit of get_global_id(2), i.e.
- *                                          stride_z * number of elements along Z processed per
- *                                          workitem
- *
- * @return A 3D tensor object
- */
-inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr,
-                                             uint offset_first_element_in_bytes, uint stride_x,
-                                             uint step_x, uint stride_y, uint step_y, uint stride_z,
-                                             uint step_z)
-{
-  Tensor3D t;
-  t.ptr = ptr;
-  t.offset_first_element_in_bytes = offset_first_element_in_bytes;
-  t.stride_x = stride_x;
-  t.stride_y = stride_y;
-  t.stride_z = stride_z;
-
-  // Skip the leading offset, then move to this work-item's position along X, Y and Z.
-  t.ptr += t.offset_first_element_in_bytes + get_global_id(0) * step_x +
-           get_global_id(1) * step_y + get_global_id(2) * step_z;
-  return t;
-}
-
-/** Build a Tensor4D descriptor and position its pointer on the calling work-item's data.
- *
- * The third global id carries two coordinates: get_global_id(2) % mod_size is used with step_z
- * and get_global_id(2) / mod_size with step_w.
- *
- * @param[in] ptr                           Pointer to the starting position of the buffer
- * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] stride_x                      Stride of the tensor in X dimension (in bytes)
- * @param[in] step_x                        Bytes to advance per unit of get_global_id(0)
- * @param[in] stride_y                      Stride of the tensor in Y dimension (in bytes)
- * @param[in] step_y                        Bytes to advance per unit of get_global_id(1)
- * @param[in] stride_z                      Stride of the tensor in Z dimension (in bytes)
- * @param[in] step_z                        Bytes to advance per unit of the Z index
- * @param[in] stride_w                      Stride of the tensor in W dimension (in bytes)
- * @param[in] step_w                        Bytes to advance per unit of the W index
- * @param[in] mod_size                      Divisor that splits get_global_id(2) into the Z index
- *                                          (remainder) and the W index (quotient)
- *
- * @return A 4D tensor object
- */
-inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr,
-                                             uint offset_first_element_in_bytes, uint stride_x,
-                                             uint step_x, uint stride_y, uint step_y, uint stride_z,
-                                             uint step_z, uint stride_w, uint step_w, uint mod_size)
-{
-  Tensor4D t;
-  t.ptr = ptr;
-  t.offset_first_element_in_bytes = offset_first_element_in_bytes;
-  t.stride_x = stride_x;
-  t.stride_y = stride_y;
-  t.stride_z = stride_z;
-  t.stride_w = stride_w;
-
-  // Skip the leading offset, then move along X, Y, Z (remainder) and W (quotient).
-  t.ptr += t.offset_first_element_in_bytes + get_global_id(0) * step_x +
-           get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z +
-           (get_global_id(2) / mod_size) * step_w;
-  return t;
-}
-
-/** Compute the address of an element relative to a Vector's current pointer.
- *
- * @param[in] vec Vector descriptor whose ptr and stride_x are used
- * @param[in] x   Relative X position (in elements)
- *
- * @return vec->ptr advanced by x * stride_x bytes
- */
-inline __global const uchar *vector_offset(const Vector *vec, int x)
-{
-  __global const uchar *element = vec->ptr;
-  element += x * vec->stride_x;
-  return element;
-}
-
-/** Compute the address of an element relative to an Image's current pointer.
- *
- * @param[in] img Image descriptor whose ptr, stride_x and stride_y are used
- * @param[in] x   Relative X position (in elements)
- * @param[in] y   Relative Y position (in elements)
- *
- * @return img->ptr advanced by x * stride_x, then by y * stride_y bytes
- */
-inline __global uchar *offset(const Image *img, int x, int y)
-{
-  __global uchar *element = img->ptr;
-  element += x * img->stride_x;
-  element += y * img->stride_y;
-  return element;
-}
-
-/** Compute the address of an element relative to a Tensor3D's current pointer.
- *
- * @param[in] tensor Tensor3D descriptor whose ptr and strides are used
- * @param[in] x      Relative X position (in elements)
- * @param[in] y      Relative Y position (in elements)
- * @param[in] z      Relative Z position (in elements)
- *
- * @return tensor->ptr advanced by x * stride_x, then y * stride_y, then z * stride_z bytes
- */
-inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
-{
-  __global const uchar *element = tensor->ptr;
-  element += x * tensor->stride_x;
-  element += y * tensor->stride_y;
-  element += z * tensor->stride_z;
-  return element;
-}
-
-/** Compute the address of an element relative to a Tensor4D's current pointer.
- *
- * @param[in] tensor Tensor4D descriptor whose ptr and strides are used
- * @param[in] x      Relative X position (in elements)
- * @param[in] y      Relative Y position (in elements)
- * @param[in] z      Relative Z position (in elements)
- * @param[in] w      Relative W position (in elements)
- *
- * @return tensor->ptr advanced by x * stride_x, then y * stride_y, then z * stride_z, then
- *         w * stride_w bytes
- */
-inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
-{
-  __global const uchar *element = tensor->ptr;
-  element += x * tensor->stride_x;
-  element += y * tensor->stride_y;
-  element += z * tensor->stride_z;
-  element += w * tensor->stride_w;
-  return element;
-}
-
-#endif // ARM_COMPUTE_HELPER_H
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
deleted file mode 100644
index 5f1b3f902..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
+++ /dev/null
@@ -1,578 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_HELPERS_ASYMM_H
-#define ARM_COMPUTE_HELPERS_ASYMM_H
-
-#include "helpers.h"
-
-/** Convert the given vector with round to nearest even rounding mode
- *
- * Expands to convert_<type>_rte(x). CONVERT_DOWN_RTE forwards to CONVERT_DOWN_RTE_STR so that a
- * macro passed as type is expanded before it is pasted into the function name.
- *
- * @param[in] x    The target to be converted
- * @param[in] type The target type
- *
- * @return The converted vector
- */
-#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x)))
-#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type)
-
-/** Quantize a floating-point scalar to unsigned 8-bit asymmetric format.
- *
- * Computes input / scale + offset, rounds it to an int with round-to-nearest-even, and
- * saturates the result to the uchar range.
- *
- * @param[in] input  Input value to quantize
- * @param[in] offset Quantization offset
- * @param[in] scale  Quantization scale
- *
- * @return quantized value
- */
-inline uchar quantize_qasymm8(float input, float offset, float scale)
-{
-  const float scaled = input / scale + offset;
-  const int rounded = CONVERT_DOWN_RTE(scaled, int);
-  return CONVERT_SAT(rounded, uchar);
-}
-
-/** Dequantize a scalar from unsigned 8-bit asymmetric format to floating-point.
- *
- * Computes (input - offset) * scale in float.
- *
- * @param[in] input  Input value to dequantize
- * @param[in] offset Quantization offset
- * @param[in] scale  Quantization scale
- *
- * @return dequantized value
- */
-inline float dequantize_qasymm8(uchar input, float offset, float scale)
-{
-  const float centered = (float)input - offset;
-  return centered * scale;
-}
-
-/** Dequantize a scalar from signed 8-bit asymmetric format to floating-point.
- *
- * Computes (input - offset) * scale in float.
- *
- * @param[in] input  Input value to dequantize
- * @param[in] offset Quantization offset
- * @param[in] scale  Quantization scale
- *
- * @return dequantized value
- */
-inline float dequantize_qasymm8_signed(char input, float offset, float scale)
-{
-  const float centered = (float)input - offset;
-  return centered * scale;
-}
-
-/** Quantize a vector of values from floating-point
- *
- * Expands to the definition of quantize_<type><size>(input, offset, scale). The generated
- * function broadcasts scale and offset to every lane, computes input / scale + offset, rounds
- * to int with round-to-nearest-even and saturates to the output type.
- *
- * @param[in] type Output data type.
- * @param[in] size Size of vector.
- *
- * @return quantized values
- */
-#define QUANTIZE_IMPL(type, size)                                                                 \
-  inline VEC_DATA_TYPE(type, size)                                                                \
-      quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale)          \
-  {                                                                                               \
-    VEC_DATA_TYPE(float, size)                                                                    \
-    out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \
-    VEC_DATA_TYPE(type, size)                                                                     \
-    res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)),                        \
-                      VEC_DATA_TYPE(type, size));                                                 \
-    return res;                                                                                   \
-  }
-
-/** Dequantize a vector of values to floating-point
- *
- * Expands to the definition of dequantize_<type><size>(input, offset, scale). The generated
- * function converts input to a float vector and returns (input - offset) * scale.
- *
- * @param[in] type Input data type.
- * @param[in] size Size of vector.
- *
- * @return dequantized values in floating point
- */
-#define DEQUANTIZE_IMPL(type, size)                                                       \
-  inline VEC_DATA_TYPE(float, size)                                                       \
-      dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
-  {                                                                                       \
-    return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale;                 \
-  }
-
-/** Correctly-rounded-to-nearest division by a power-of-two.
- *
- * Expands to the definition of asymm_rounding_divide_by_POW2_<size>(x, exponent). Per lane the
- * generated function forms mask = (1 << exponent) - 1 and threshold = (mask >> 1), plus one
- * when x is negative; the result is x >> exponent, plus one when (x & mask) exceeds threshold.
- *
- * @param[in] size Size of vector.
- *
- * @return Correctly-rounded-to-nearest division by a power-of-two.
- */
-#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size)                        \
-  inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size( \
-      VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent)    \
-  {                                                                     \
-    const VEC_DATA_TYPE(int, size) zero = (VEC_DATA_TYPE(int, size))0;  \
-    const VEC_DATA_TYPE(int, size) one = (VEC_DATA_TYPE(int, size))1;   \
-    VEC_DATA_TYPE(int, size)                                            \
-    mask = (one << exponent) - one;                                     \
-    VEC_DATA_TYPE(int, size)                                            \
-    threshold = (mask >> 1) + select(zero, one, x < 0);                 \
-    return (x >> exponent) + select(zero, one, (x & mask) > threshold); \
-  }
-
-/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
- * rounding to the nearest value, and saturating -1 * -1 to the maximum value.
- *
- * Expands to the definition of asymm_mult<size>(a, b). The generated function widens both
- * operands to 64 bit, multiplies them, adds a rounding nudge (1 << 30 when the product is
- * non-negative, 1 - (1 << 30) otherwise), divides by 1 << 31 and converts back to int. Lanes in
- * which both a and b equal INT_MIN return INT_MAX instead.
- *
- * @param[in] size Size of vector.
- *
- * @return Product of two fixed-point numbers.
- */
-#define ASYMM_MULT_IMPL(size)                                                  \
-  inline VEC_DATA_TYPE(int, size)                                              \
-      asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
-  {                                                                            \
-    VEC_DATA_TYPE(int, size)                                                   \
-    overflow = a == b && a == INT_MIN;                                         \
-    VEC_DATA_TYPE(long, size)                                                  \
-    a_64 = convert_long##size(a);                                              \
-    VEC_DATA_TYPE(long, size)                                                  \
-    b_64 = convert_long##size(b);                                              \
-    VEC_DATA_TYPE(long, size)                                                  \
-    ab_64 = a_64 * b_64;                                                       \
-    /* Revert COMPMID-907 */                                                   \
-    VEC_DATA_TYPE(long, size)                                                  \
-    mask1 = 1 << 30;                                                           \
-    VEC_DATA_TYPE(long, size)                                                  \
-    mask2 = 1 - (1 << 30);                                                     \
-    VEC_DATA_TYPE(long, size)                                                  \
-    is_positive_or_zero = ab_64 >= 0;                                          \
-    VEC_DATA_TYPE(long, size)                                                  \
-    nudge = select(mask2, mask1, is_positive_or_zero);                         \
-    VEC_DATA_TYPE(long, size)                                                  \
-    mask = 1ll << 31;                                                          \
-    VEC_DATA_TYPE(int, size)                                                   \
-    ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask);                  \
-    return select(ab_x2_high32, INT_MAX, overflow);                            \
-  }
-
-/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0).
- *
- * Generates asymm_exp_on_interval_between_negative_one_quarter_and_0_excl<size>(a). The input is
- * shifted to x = a + 2^28 (that is, a + 1/8 with 31 fractional bits) and the result is
- * constant_term * (1 + x + x^2/2 + x^3/6 + x^4/24), evaluated with ASYMM_MULT and
- * ASYMM_ROUNDING_DIVIDE_BY_POW2. constant_term (1895147668) is approximately exp(-1/8) * 2^31
- * and constant_1_over_3 (715827883) is approximately 2^31 / 3.
- *
- * @param[in] size Size of vector.
- *
- * @return Result in fixed-point format Q0.
- */
-#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size)                   \
-  inline VEC_DATA_TYPE(int, size)                                                                  \
-      asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) \
-                                                                              a)                   \
-  {                                                                                                \
-    const VEC_DATA_TYPE(int, size) constant_term = 1895147668;                                     \
-    const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883;                                  \
-    const int k_fractional_bits = 31;                                                              \
-    VEC_DATA_TYPE(int, size)                                                                       \
-    x = a + (1 << (k_fractional_bits - 3));                                                        \
-    VEC_DATA_TYPE(int, size)                                                                       \
-    x2 = ASYMM_MULT(x, x, size);                                                                   \
-    VEC_DATA_TYPE(int, size)                                                                       \
-    x3 = ASYMM_MULT(x2, x, size);                                                                  \
-    VEC_DATA_TYPE(int, size)                                                                       \
-    x4 = ASYMM_MULT(x2, x2, size);                                                                 \
-    VEC_DATA_TYPE(int, size)                                                                       \
-    x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size);                                        \
-    VEC_DATA_TYPE(int, size)                                                                       \
-    x4_over_24_plus_x3_over_6_plus_x2 =                                                            \
-        ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2;                                \
-    VEC_DATA_TYPE(int, size)                                                                       \
-    x4_over_24_plus_x3_over_6_plus_x2_over_2 =                                                     \
-        ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size);                 \
-    return constant_term +                                                                         \
-           ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size);          \
-  }
-
-/** Bitwise multiplexer: every bit of the result is taken from then_val where the matching bit
- * of if_mask is 1, and from else_val where it is 0.
- * Equivalent to the VBSL instruction in ARM NEON.
- *
- * @param[in] size Size of vector.
- *
- * @returns Result containing bits from @p then_val or from @p else_val depending on whether the
- * corresponding bit in @p if_mask is set or not.
- */
-#define ASYMM_SELECT_USING_MASK_IMPL(size) \
-  inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, \
-                                                                VEC_DATA_TYPE(int, size) then_val, \
-                                                                VEC_DATA_TYPE(int, size) else_val) \
-  { \
-    const VEC_DATA_TYPE(int, size) taken_from_then = if_mask & then_val; \
-    const VEC_DATA_TYPE(int, size) taken_from_else = ~if_mask & else_val; \
-    /* The two partial results never share a set bit, so XOR merges them. */ \
-    return taken_from_then ^ taken_from_else; \
-  }
-
-/** Generates asymm_mask_if_zero<size>(a): an element-wise mask that has all bits set for the
- * elements of the input that are equal to zero and no bits set for the others.
- *
- * @param[in] size Size of vector.
- *
- * @returns Output vector whose elements are ~0 where @p a is zero and 0 elsewhere.
- */
-#define ASYMM_MASK_IF_ZERO_IMPL(size) \
-  inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \
-  { \
-    const VEC_DATA_TYPE(int, size) no_bits = 0; \
-    const VEC_DATA_TYPE(int, size) every_bit = ~0; \
-    const VEC_DATA_TYPE(int, size) is_zero = (a == 0); \
-    return select(no_bits, every_bit, is_zero); \
-  }
-
-/** Generates asymm_mask_if_non_zero<size>(a): an element-wise mask that has all bits set for the
- * elements of the input that differ from zero and no bits set for the others.
- *
- * @param[in] size Size of vector.
- *
- * @returns Output vector whose elements are ~0 where @p a is non-zero and 0 elsewhere.
- */
-#define ASYMM_MASK_IF_NON_ZERO_IMPL(size) \
-  inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \
-  { \
-    const VEC_DATA_TYPE(int, size) no_bits = 0; \
-    const VEC_DATA_TYPE(int, size) every_bit = ~0; \
-    const VEC_DATA_TYPE(int, size) is_non_zero = (a != 0); \
-    return select(no_bits, every_bit, is_non_zero); \
-  }
-
-/** Generates exp_barrel_shifter<size>(), a single stage of the bit-by-bit evaluation performed
- * by ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL.
- *
- * When k_integer_bits > exponent, every element whose remainder has bit
- * (k_fractional_bits + exponent) set gets its result replaced by the fixed-point product
- * ASYMM_MULT(result, fp_multiplier); all other elements keep result. When
- * k_integer_bits <= exponent the stage does nothing and result is returned unchanged.
- *
- * @param[in] size Size of vector.
- *
- * @return Partial result after applying this stage.
- */
-#define EXP_BARREL_SHIFTER_IMPL(size) \
-  inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size( \
-      VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, \
-      int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
-  { \
-    /* Stage not applicable for this fixed-point format: leave result untouched. */ \
-    if (k_integer_bits <= exponent) \
-    { \
-      return result; \
-    } \
-    \
-    /* Position of the remainder bit handled by this stage. */ \
-    const int k_shift_amount = k_fractional_bits + exponent; \
-    return ASYMM_SELECT_USING_MASK( \
-        ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \
-        ASYMM_MULT(result, fp_multiplier, size), result, size); \
-  }
-
-/** Calculates \f$ exp(x) \f$ for x < 0.
- *
- * Generates asymm_exp_on_negative_values<size>(a, k_integer_bits); the code treats a as having
- * 31 - k_integer_bits fractional bits. The input is split into a part lying in [-1/4, 0), which
- * is evaluated with ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL, and a
- * remainder whose set bits are folded into the result one at a time by the EXP_BARREL_SHIFTER
- * stages for exponents -2 to +4. The stage multipliers look like exp(-2^exponent) with 31
- * fractional bits (e.g. 1672461947 ~ exp(-1/4) * 2^31, 790015084 ~ exp(-1) * 2^31).
- * When k_integer_bits > 5, elements smaller than -(1 << (k_fractional_bits + 5)) produce 0.
- * Elements equal to 0 produce INT_MAX.
- *
- * @param[in] size Size of vector.
- *
- * @return Result in fixed-point format Q0.
- */
-#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size)                                                   \
-  inline VEC_DATA_TYPE(int, size)                                                                 \
-      asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits)          \
-  {                                                                                               \
-    const int k_fractional_bits = 31 - k_integer_bits;                                            \
-    VEC_DATA_TYPE(int, size)                                                                      \
-    k_one_quarter = 1 << (k_fractional_bits - 2);                                                 \
-    VEC_DATA_TYPE(int, size)                                                                      \
-    mask = k_one_quarter - 1;                                                                     \
-    VEC_DATA_TYPE(int, size)                                                                      \
-    a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter;                                 \
-    VEC_DATA_TYPE(int, size)                                                                      \
-    a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits;   \
-    VEC_DATA_TYPE(int, size)                                                                      \
-    result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(                       \
-        a_mod_quarter_minus_one_quarter_scaled, size);                                            \
-    VEC_DATA_TYPE(int, size)                                                                      \
-    remainder = a_mod_quarter_minus_one_quarter - a;                                              \
-                                                                                                  \
-    result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits,        \
-                                remainder, size);                                                 \
-    result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits,        \
-                                remainder, size);                                                 \
-    result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits,         \
-                                remainder, size);                                                 \
-    result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits,         \
-                                remainder, size);                                                 \
-    result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits,          \
-                                remainder, size);                                                 \
-    result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, \
-                                size);                                                            \
-    result =                                                                                      \
-        EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size);  \
-                                                                                                  \
-    if (k_integer_bits > 5)                                                                       \
-    {                                                                                             \
-      const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5));                     \
-      result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size); \
-    }                                                                                             \
-                                                                                                  \
-    const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                              \
-    return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size);            \
-  }
-
-/** Calculates the product of an integer value by a power of two, with either a positive exponent
- * (equivalent to an arithmetic left shift, saturating) or a negative exponent
- * (equivalent to an arithmetic right shift, rounding to nearest).
- *
- * Generates asymm_saturating_rounding_mult_by_pow2<size>(x, exponent). Negative exponents are
- * forwarded to ASYMM_ROUNDING_DIVIDE_BY_POW2 with -exponent. Otherwise x is shifted left by
- * exponent; elements greater than threshold = (1 << (31 - exponent)) - 1 are replaced by
- * INT_MAX and elements smaller than -threshold are replaced by INT_MIN.
- *
- * NOTE(review): for exponent == 0 the threshold expression evaluates (1 << 31) - 1 in int
- * arithmetic, which depends on wrap-around behaviour - confirm this is intended
- * (ASYMM_RESCALE_IMPL reaches this path when source and destination integer bits are equal).
- *
- * @param[in] size Size of vector.
- *
- * @return Arithmetic left or right shift.
- */
-#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size)                                    \
-  inline VEC_DATA_TYPE(int, size)                                                            \
-      asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
-  {                                                                                          \
-    if (exponent < 0)                                                                        \
-    {                                                                                        \
-      return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size);                              \
-    }                                                                                        \
-                                                                                             \
-    const VEC_DATA_TYPE(int, size) min = INT_MIN;                                            \
-    const VEC_DATA_TYPE(int, size) max = INT_MAX;                                            \
-    int threshold = ((1 << (31 - exponent)) - 1);                                            \
-    VEC_DATA_TYPE(int, size)                                                                 \
-    positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size);                             \
-    VEC_DATA_TYPE(int, size)                                                                 \
-    negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size);                            \
-    VEC_DATA_TYPE(int, size)                                                                 \
-    result = x << exponent;                                                                  \
-    result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size);                      \
-    result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size);                      \
-    return result;                                                                           \
-  }
-
-/** Generates asymm_rounding_half_sum<size>(a, b), which returns (a+b)/2 rounded to the nearest
- * integer. The sum is formed in 64-bit and biased by +1 or -1 (following the sign of the sum)
- * before the division by two, so exact halves move away from zero.
- * Intended as the counterpart of VRHADD in the ARM NEON instruction set.
- *
- * @param[in] size Size of vector.
- *
- * @return (a+b)/2, rounded to the nearest integer.
- */
-#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \
-  inline VEC_DATA_TYPE(int, size) \
-      asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
-  { \
-    const VEC_DATA_TYPE(long, size) k_plus_one = 1; \
-    const VEC_DATA_TYPE(long, size) k_minus_one = -1; \
-    /* Widen before adding so the sum of two ints cannot overflow. */ \
-    const VEC_DATA_TYPE(long, size) wide_sum = convert_long##size(a) + convert_long##size(b); \
-    /* Rounding bias carries the sign of the sum. */ \
-    const VEC_DATA_TYPE(long, size) bias = select(k_minus_one, k_plus_one, wide_sum >= 0); \
-    return convert_int##size((wide_sum + bias) / 2); \
-  }
-
-/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1).
- *
- * Generates asymm_one_over_one_plus_x_for_x_in_0_1<size>(a). half_denominator is the rounding
- * half-sum of a and INT_MAX. The estimate x starts at
- * Q2_48_over_17 + half_denominator * Q2_neg_32_over_17 (the constants 1515870810 and
- * -1010580540 match 48/17 and -32/17 scaled by 2^29) and is refined by three iterations of
- * x += ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x * (Q2_one - half_denominator * x), 2),
- * where the products are ASYMM_MULT fixed-point products. The final estimate is returned after
- * one more saturating multiplication by 2.
- *
- * @param[in] size Size of vector.
- *
- * @return Result in fixed-point format Q0.
- */
-#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size)                      \
-  inline VEC_DATA_TYPE(int, size)                                              \
-      asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \
-  {                                                                            \
-    const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                           \
-    const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2);                     \
-    VEC_DATA_TYPE(int, size)                                                   \
-    half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size);               \
-    const VEC_DATA_TYPE(int, size) Q2_48_over_17 = 1515870810;                 \
-    const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540;            \
-    VEC_DATA_TYPE(int, size)                                                   \
-    x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size); \
-    for (int i = 0; i < 3; i++)                                                \
-    {                                                                          \
-      VEC_DATA_TYPE(int, size)                                                 \
-      half_denominator_times_x = ASYMM_MULT(half_denominator, x, size);        \
-      VEC_DATA_TYPE(int, size)                                                 \
-      one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x;  \
-      VEC_DATA_TYPE(int, size)                                                 \
-      tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size);           \
-      x = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size);            \
-    }                                                                          \
-    return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size);                 \
-  }
-
-/** Generates asymm_rescale<size>(value, src_integer_bits, dst_integer_bits), which reinterprets a
- * fixed-point value with a different number of integer bits by multiplying it by
- * 2^(src_integer_bits - dst_integer_bits) through ASYMM_SATURATING_ROUNDING_MULT_BY_POW2.
- *
- * @param[in] size Size of vector.
- *
- * @return Rescaled value.
- */
-#define ASYMM_RESCALE_IMPL(size) \
-  inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, \
-                                                      int src_integer_bits, int dst_integer_bits) \
-  { \
-    return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, src_integer_bits - dst_integer_bits, \
-                                                  size); \
-  }
-
-/* Call wrappers for the quantize_<type><size> / dequantize_<type><size> helpers. The *_STR
- * macros perform the token pasting; routing calls through the outer macro lets `type` and
- * `size` arguments that are themselves macros be expanded before they are pasted. */
-#define QUANTIZE_STR(input, offset, scale, type, size) \
-  quantize_##type##size(input, offset, scale)
-#define QUANTIZE(input, offset, scale, type, size) \
-  QUANTIZE_STR(input, offset, scale, type, size)
-#define DEQUANTIZE_STR(input, offset, scale, type, size) \
-  dequantize_##type##size(input, offset, scale)
-#define DEQUANTIZE(input, offset, scale, type, size) \
-  DEQUANTIZE_STR(input, offset, scale, type, size)
-
-/* Call wrappers that select the helper generated for a given vector size by pasting `size`
- * onto the function name. */
-#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) \
-  asymm_rounding_divide_by_POW2_##size(x, exponent)
-#define ASYMM_MULT(a, b, size) asymm_mult##size(a, b)
-/* Scales x by 2^(-left_shift) and then multiplies by quantized_multiplier. The shift count that
- * is applied is -left_shift, so callers are presumably expected to pass a non-positive
- * left_shift - TODO confirm against the call sites. The x and left_shift arguments are
- * parenthesised so that compound expressions keep their intended precedence. */
-#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
-  ASYMM_MULT((x) * ((VEC_DATA_TYPE(int, size))(1) << (-(left_shift))), quantized_multiplier, size)
-/* Multiplies x by quantized_multiplier and then applies a rounding division by 2^right_shift. */
-#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
-  ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
-#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) \
-  asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
-#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) \
-  asymm_select_using_mask##size(if_mask, then_val, else_val)
-#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
-#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
-#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, \
-                           remainder, size)                                                    \
-  exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, \
-                           remainder)
-#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) \
-  asymm_exp_on_negative_values##size(a, k_integer_bits)
-#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) \
-  asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
-#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) \
-  asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
-#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
-#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \
-  asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
-
-/** Generates multiply_by_quantized_multiplier<size>(input, qmul, shift).
- *
- * A positive shift multiplies input by 2^shift before the fixed-point multiplication by qmul;
- * a zero or negative shift is applied afterwards as a rounding division by 2^(-shift).
- *
- * @param[in] size Size of vector.
- *
- * @return input scaled by the quantized multiplier and shift.
- */
-#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \
-  inline VEC_DATA_TYPE(int, size) \
-      multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \
-  { \
-    int pre_shift = 0; \
-    int post_shift = 0; \
-    if (shift > 0) \
-    { \
-      pre_shift = shift; \
-    } \
-    else \
-    { \
-      post_shift = -shift; \
-    } \
-    return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << pre_shift), qmul, size), \
-                                         post_shift, size); \
-  }
-/* Call wrapper selecting the variant generated for the given vector size. */
-#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) \
-  multiply_by_quantized_multiplier##size(input, qmul, shift)
-
-/* Instantiation of the helper functions. Each *_IMPL line below expands to one inline function
- * for the given element type and/or vector size.
- *
- * The exp-related helpers (ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL,
- * EXP_BARREL_SHIFTER, ASYMM_EXP_ON_NEGATIVE_VALUES), ASYMM_ROUNDING_HALF_SUM and
- * ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1 are only generated for sizes 2, 4, 8 and 16; there is
- * no size-1 variant of them. The other ASYMM_* helpers and MULTIPLY_BY_QUANTIZED_MULTIPLIER are
- * generated for sizes 1, 2, 4, 8 and 16. QUANTIZE/DEQUANTIZE exist only for the type/size pairs
- * listed here.
- */
-QUANTIZE_IMPL(uchar, 1)
-QUANTIZE_IMPL(char, 1)
-QUANTIZE_IMPL(uint, 1)
-QUANTIZE_IMPL(int, 1)
-QUANTIZE_IMPL(uchar, 4)
-QUANTIZE_IMPL(ushort, 4)
-QUANTIZE_IMPL(short, 4)
-QUANTIZE_IMPL(uchar, 16)
-QUANTIZE_IMPL(char, 16)
-QUANTIZE_IMPL(ushort, 16)
-QUANTIZE_IMPL(short, 16)
-QUANTIZE_IMPL(uint, 16)
-QUANTIZE_IMPL(int, 16)
-
-DEQUANTIZE_IMPL(uchar, 1)
-DEQUANTIZE_IMPL(char, 1)
-DEQUANTIZE_IMPL(uint, 1)
-DEQUANTIZE_IMPL(int, 1)
-DEQUANTIZE_IMPL(uchar, 4)
-DEQUANTIZE_IMPL(ushort, 4)
-DEQUANTIZE_IMPL(short, 4)
-DEQUANTIZE_IMPL(uchar, 16)
-DEQUANTIZE_IMPL(char, 16)
-DEQUANTIZE_IMPL(ushort, 16)
-DEQUANTIZE_IMPL(short, 16)
-DEQUANTIZE_IMPL(uint, 16)
-DEQUANTIZE_IMPL(int, 16)
-
-ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1)
-ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
-ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
-ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
-ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)
-
-ASYMM_MULT_IMPL(1)
-ASYMM_MULT_IMPL(2)
-ASYMM_MULT_IMPL(4)
-ASYMM_MULT_IMPL(8)
-ASYMM_MULT_IMPL(16)
-
-ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2)
-ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4)
-ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8)
-ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16)
-
-ASYMM_SELECT_USING_MASK_IMPL(1)
-ASYMM_SELECT_USING_MASK_IMPL(2)
-ASYMM_SELECT_USING_MASK_IMPL(4)
-ASYMM_SELECT_USING_MASK_IMPL(8)
-ASYMM_SELECT_USING_MASK_IMPL(16)
-
-ASYMM_MASK_IF_ZERO_IMPL(1)
-ASYMM_MASK_IF_ZERO_IMPL(2)
-ASYMM_MASK_IF_ZERO_IMPL(4)
-ASYMM_MASK_IF_ZERO_IMPL(8)
-ASYMM_MASK_IF_ZERO_IMPL(16)
-
-ASYMM_MASK_IF_NON_ZERO_IMPL(1)
-ASYMM_MASK_IF_NON_ZERO_IMPL(2)
-ASYMM_MASK_IF_NON_ZERO_IMPL(4)
-ASYMM_MASK_IF_NON_ZERO_IMPL(8)
-ASYMM_MASK_IF_NON_ZERO_IMPL(16)
-
-EXP_BARREL_SHIFTER_IMPL(2)
-EXP_BARREL_SHIFTER_IMPL(4)
-EXP_BARREL_SHIFTER_IMPL(8)
-EXP_BARREL_SHIFTER_IMPL(16)
-
-ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2)
-ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4)
-ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8)
-ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16)
-
-ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(1)
-ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2)
-ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4)
-ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8)
-ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16)
-
-ASYMM_ROUNDING_HALF_SUM_IMPL(2)
-ASYMM_ROUNDING_HALF_SUM_IMPL(4)
-ASYMM_ROUNDING_HALF_SUM_IMPL(8)
-ASYMM_ROUNDING_HALF_SUM_IMPL(16)
-
-ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2)
-ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4)
-ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8)
-ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16)
-
-ASYMM_RESCALE_IMPL(1)
-ASYMM_RESCALE_IMPL(2)
-ASYMM_RESCALE_IMPL(4)
-ASYMM_RESCALE_IMPL(8)
-ASYMM_RESCALE_IMPL(16)
-
-MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(1)
-MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(2)
-MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(4)
-MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(8)
-MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(16)
-
-#endif // ARM_COMPUTE_HELPERS_ASYMM_H
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl
deleted file mode 100644
index 014842680..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl
+++ /dev/null
@@ -1,267 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-
-#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(DIM_X) && \
-    defined(DIM_Y) && defined(DIM_Z)
-/** This function normalizes the input 2D tensor across the first dimension with respect to mean and
- * standard deviation of the same dimension.
- *
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
- * -DVEC_SIZE=16
- * @attention Data type should be passed using the -DDATA_TYPE=data_type compile flag, e.g.
- * -DDATA_TYPE=float
- * @attention Normalization epsilon parameter should be given as a preprocessor argument with
- * -DEPSILON=value. e.g. -DEPSILON=0.001f
- * @attention Dimensions X, Y, and Z should be given as a preprocessor argument with -DDIM_X=value,
- * -DDIM_Y=value, -DDIM_Z=value. e.g. -DDIM_X=6, -DDIM_Y=2, -DDIM_Z=7
- *
- * @param[in]  input_ptr                            Pointer to the first source tensor. Supported
- * data types: F16/F32
- * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension
- * (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the first source tensor in Y dimension
- * (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the first source tensor in Z dimension
- * (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first
- * source tensor
- * @param[out] output_ptr                           (Optional) Pointer to the destination tensor.
- * Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      (Optional) Stride of the destination tensor in X
- * dimension (in bytes)
- * @param[in]  output_step_x                        (Optional) output_stride_x * number of elements
- * along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      (Optional) Stride of the destination tensor in Y
- * dimension (in bytes)
- * @param[in]  output_step_y                        (Optional) output_stride_y * number of elements
- * along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      (Optional) Stride of the destination tensor in Z
- * dimension (in bytes)
- * @param[in]  output_step_z                        (Optional) output_stride_z * number of elements
- * along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes (Optional) The offset of the first element in
- * the destination tensor
- * @param[in]  gamma_ptr                            (Optional) Pointer to the gamma tensor.
- * Supported data types: same as @p input_ptr
- * @param[in]  gamma_stride_x                       (Optional) Stride of the gamma tensor in X
- * dimension (in bytes)
- * @param[in]  gamma_step_x                         (Optional) gamma_stride_x * number of elements
- * along X processed per workitem(in bytes)
- * @param[in]  gamma_offset_first_element_in_bytes  (Optional) The offset of the first element in
- * the gamma tensor
- * @param[in]  beta_ptr                             (Optional) Pointer to the beta tensor. Supported
- * data types: same as @p input_ptr
- * @param[in]  beta_stride_x                        (Optional) Stride of the beta tensor in X
- * dimension (in bytes)
- * @param[in]  beta_step_x                          (Optional) beta_stride_x * number of elements
- * along X processed per workitem(in bytes)
- * @param[in]  beta_offset_first_element_in_bytes   (Optional) The offset of the first element in
- * the beta tensor
- */
-__kernel void instance_normalization_ex(TENSOR4D_DECLARATION(input)
-#ifndef IN_PLACE
-                                            ,
-                                        TENSOR4D_DECLARATION(output)
-#endif /* IN_PLACE */
-#ifdef GAMMA
-                                            ,
-                                        VECTOR_DECLARATION(gamma)
-#endif // GAMMA
-#ifdef BETA
-                                            ,
-                                        VECTOR_DECLARATION(beta)
-#endif // BETA
-                                            )
-{
-  Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
-#ifndef IN_PLACE
-  Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
-#endif /* IN_PLACE */
-
-  float sum = 0.f;    // Sum of the elements of the plane
-  float sum_sq = 0.f; // Sum of the squared elements of the plane
-
-#if defined(NHWC)
-  // The channel is the first coordinate, so the plane to reduce spans DIM_Y x DIM_Z
-  const int ch = get_global_id(0);    // Current channel
-  const int batch = get_global_id(2); // Current batch
-  const int elements_plane = DIM_Y * DIM_Z;
-
-  for (int i_w = 0; i_w < DIM_Y; ++i_w)
-  {
-    for (int i_h = 0; i_h < DIM_Z; ++i_h)
-    {
-      float data = (float)*((__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch));
-      sum += data;
-      sum_sq += data * data;
-    }
-  }
-
-#else // !defined(NHWC)
-  const int ch = get_global_id(2) % DIM_Z;    // Current channel
-  const int batch = get_global_id(2) / DIM_Z; // Current batch
-  const int elements_plane = DIM_X * DIM_Y;
-
-  VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-  part_sum = 0.f;
-  VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-  part_sum_sq = 0.f;
-  // Calculate partial sum
-  for (int y = 0; y < DIM_Y; ++y)
-  {
-    int x = 0;
-    for (; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE)
-    {
-      // Load data
-      VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-      data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch));
-      part_sum += data;
-      part_sum_sq += data * data;
-    }
-    // Left-overs loop
-    for (; x < DIM_X; ++x)
-    {
-      DATA_TYPE data = *((__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch));
-      part_sum.s0 += data;
-      part_sum_sq.s0 += data * data;
-    }
-  }
-// Perform reduction
-#if VEC_SIZE > 8
-  part_sum.s01234567 += part_sum.s89abcdef;
-  part_sum_sq.s01234567 += part_sum_sq.s89abcdef;
-#endif // VEC_SIZE > 8
-#if VEC_SIZE > 4
-  part_sum.s0123 += part_sum.s4567;
-  part_sum_sq.s0123 += part_sum_sq.s4567;
-#endif // VEC_SIZE > 4
-#if VEC_SIZE > 2
-  part_sum.s01 += part_sum.s23;
-  part_sum_sq.s01 += part_sum_sq.s23;
-#endif // VEC_SIZE > 2
-  part_sum.s0 += part_sum.s1;
-  part_sum_sq.s0 += part_sum_sq.s1;
-  sum = (float)part_sum.s0;
-  sum_sq = (float)part_sum_sq.s0;
-
-#endif // defined(NHWC)
-  // Mean and variance of the plane (variance computed as E[x^2] - E[x]^2)
-  const float mean_float = (sum / elements_plane);
-  const DATA_TYPE mean = (DATA_TYPE)mean_float;
-  const float var_float = (sum_sq / elements_plane) - (mean_float * mean_float);
-#if defined(GAMMA)
-  const float gamma_float =
-      (float)*((__global DATA_TYPE *)(gamma_ptr + gamma_offset_first_element_in_bytes) + ch);
-#else  // !defined(GAMMA)
-  const float gamma_float = 1.f; // No gamma tensor: keep the normalized value unscaled
-#endif // defined(GAMMA)
-  const DATA_TYPE multip = (DATA_TYPE)(gamma_float / sqrt(var_float + EPSILON));
-#if defined(BETA)
-  const DATA_TYPE beta =
-      *((__global DATA_TYPE *)(beta_ptr + beta_offset_first_element_in_bytes) + ch);
-#else  // !defined(BETA)
-  const DATA_TYPE beta = 0;
-#endif // defined(BETA)
-
-#if defined(NHWC)
-  for (int i_w = 0; i_w < DIM_Y; ++i_w)
-  {
-    for (int i_h = 0; i_h < DIM_Z; ++i_h)
-    {
-      __global DATA_TYPE *input_address =
-          (__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch);
-#ifdef IN_PLACE
-      __global DATA_TYPE *output_address = input_address;
-#else  /* !IN_PLACE */
-      __global DATA_TYPE *output_address =
-          (__global DATA_TYPE *)tensor4D_offset(&out, ch, i_w, i_h, batch);
-#endif /* IN_PLACE */
-      *(output_address) = (*(input_address)-mean) * multip + beta;
-    }
-  }
-#else // !defined(NHWC)
-  for (int y = 0; y < DIM_Y; ++y)
-  {
-    int x = 0;
-    for (; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE)
-    {
-      __global DATA_TYPE *input_address =
-          (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
-#ifdef IN_PLACE
-      __global DATA_TYPE *output_address = input_address;
-#else  /* !IN_PLACE */
-      __global DATA_TYPE *output_address =
-          (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
-#endif /* IN_PLACE */
-
-      VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-      data = VLOAD(VEC_SIZE)(0, input_address);
-      // Normalize: (x - mean) * gamma / sqrt(var + EPSILON) + beta
-      VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-      res = (data - mean) * multip + beta;
-      VSTORE(VEC_SIZE)
-      (res, 0, output_address);
-    }
-    // Left-overs loop
-    for (; x < DIM_X; ++x)
-    {
-      __global DATA_TYPE *input_address =
-          (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
-#ifdef IN_PLACE
-      __global DATA_TYPE *output_address = input_address;
-#else  /* !IN_PLACE */
-      __global DATA_TYPE *output_address =
-          (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
-#endif /* IN_PLACE */
-      *(output_address) = (*(input_address)-mean) * multip + beta;
-    }
-  }
-#endif // defined(NHWC)
-}
-#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(DIM_X) && \
-          defined(DIM_Y) && defined(DIM_Z) */
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl
deleted file mode 100644
index 3943fc4c2..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-
-#if defined(VEC_SIZE) && defined(DATA_TYPE)
-
-/** Multiplies the input by scale_factor and by the multiplier argument.
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
- * -DDATA_TYPE=float
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
- * -DVEC_SIZE=16
- * @note Quantization scale of input tensor is passed in with -DSCALE=scale.
- *
- * @param[in]  input_ptr                            Pointer to the source tensor. Supported data
- * types: S32
- * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in
- * bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in
- * bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source
- * tensor
- * @param[in]  scale_ptr                            Pointer to the scale tensor. Supported data
- * types: same as @p output_ptr
- * @param[in]  scale_stride_x                       Stride of the scale tensor in X dimension (in
- * bytes)
- * @param[in]  scale_step_x                         scale_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in]  scale_offset_first_element_in_bytes  The offset of the first element in the scale
- * tensor
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported
- * data types: F16/F32
- * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension
- * (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension
- * (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the
- * destination tensor
- */
-__kernel void multiply_scale_factor(IMAGE_DECLARATION(input), VECTOR_DECLARATION(scale),
-                                    IMAGE_DECLARATION(output), float multiplier)
-{
-  // Get pixels pointer
-  Image input = CONVERT_TO_IMAGE_STRUCT(input);
-  Image output = CONVERT_TO_IMAGE_STRUCT(output);
-
-  // Scale entry selected by get_global_id(1), read past the scale tensor's first-element offset
-  __global const DATA_TYPE *scale_addr =
-      (__global const DATA_TYPE *)(scale_ptr + scale_offset_first_element_in_bytes) +
-      get_global_id(1);
-
-#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
-  // Check if access on width gets out of bounds
-  // If it does shift access vector to access elements within bounds
-  const int xi = (int)(get_global_id(0) * VEC_SIZE);
-  input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
-  output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
-  // Load data (the input is read as 32-bit integers and converted to DATA_TYPE)
-  VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-  val = CONVERT(VLOAD(VEC_SIZE)(0, (__global int *)input.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
-  // Create scale vector
-  VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-  vscale = *scale_addr;
-  // Dequantize
-  vscale *= (DATA_TYPE)(multiplier);
-  val *= vscale;
-  // Store result
-  VSTORE(VEC_SIZE)
-  (val, 0, (__global DATA_TYPE *)output.ptr);
-#else  // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
-  *((__global DATA_TYPE *)(output.ptr)) =
-      ((DATA_TYPE)(*((__global int *)(input.ptr)))) * (*scale_addr) * (DATA_TYPE)(multiplier);
-#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
-}
-
-#endif // defined(VEC_SIZE) && defined(DATA_TYPE)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl
deleted file mode 100644
index 15c16f80c..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-
-#ifndef VEC_SIZE
-#define VEC_SIZE 1
-#endif
-
-#if defined(DATA_TYPE)
-/** Performs a negation of input tensor.
- *
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
- *            -DVEC_SIZE=16
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data
- *                                                  types: S16/S32/F16/F32.
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in
- *                                                  bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X
- *                                                  processed per work item (in bytes)
- * @param[in]  input_offset_first_element_in_bytes  Offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported
- *                                                  data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension
- *                                                  (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X
- *                                                  processed per work item (in bytes)
- * @param[in]  output_offset_first_element_in_bytes Offset of the first element in the destination
- * image
- *
- */
-__kernel void neg_tensor(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
-{
-  Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(input);
-  Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(output);
-  // Load VEC_SIZE elements from the source, negate them and store them to the destination
-  VSTORE(VEC_SIZE)
-  (-VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr), 0, (__global DATA_TYPE *)dst.ptr);
-}
-#endif // defined(DATA_TYPE)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl
deleted file mode 100644
index c274aba62..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl
+++ /dev/null
@@ -1,222 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(AXIS) && defined(DEPTH) && defined(OUTPUT_DIM_Z)
-
-/** Performs the OneHot operation along the chosen axis
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
- * -DDATA_TYPE=short
- * @note Axis should be given as a preprocessor argument using -DAXIS=axis. e.g. -DAXIS=1
- * @attention Output tensor depth should be given as a preprocessor argument using
- * -DOUTPUT_DIM_Z=size. e.g. -DOUTPUT_DIM_Z=16
- * @attention Input tensor depth should be given as a preprocessor argument using
- * -DINPUT_DIM_Z=size. e.g. -DINPUT_DIM_Z=16
- *
- *
- * @param[in]  indices_ptr                              Pointer to the source tensor. Supported data
- * types: S32
- * @param[in]  indices_stride_x                         Stride of the source tensor in X dimension
- * (in bytes)
- * @param[in]  indices_step_x                           indices_stride_x * number of elements along
- * X processed per work item (in bytes)
- * @param[in]  indices_stride_y                         Stride of the source tensor in Y dimension
- * (in bytes)
- * @param[in]  indices_step_y                           indices_stride_y * number of elements along
- * Y processed per work item (in bytes)
- * @param[in]  indices_stride_z                         Stride of the source tensor in Z dimension
- * (in bytes)
- * @param[in]  indices_step_z                           indices_stride_z * number of elements along
- * Z processed per work item (in bytes)
- * @param[in]  indices_offset_first_element_in_bytes    Offset of the first element in the source
- * tensor
- * @param[in]  on_value_ptr                             Pointer to the on_value vector. Supported
- * data types: U8/S8/U16/S16/F16/U32/S32/F32.
- * @param[in]  on_value_stride_x                        Stride of the on_value vector in X dimension
- * (in bytes)
- * @param[in]  on_value_step_x                          on_value_stride_x * number of elements along
- * X processed per work item (in bytes)
- * @param[in]  on_value_offset_first_element_in_bytes   Offset of the first element in the on_value
- * vector
- * @param[in]  off_value_ptr                            Pointer to the off_value vector. Supported
- * data types: Same as @p on_value.
- * @param[in]  off_value_stride_x                       Stride of the off_value vector in X
- * dimension (in bytes)
- * @param[in]  off_value_step_x                         off_value_stride_x * number of elements
- * along X processed per work item (in bytes)
- * @param[in]  off_value_offset_first_element_in_bytes  Offset of the first element in the off_value
- * vector
- * @param[out] output_ptr                               Pointer to the destination tensor. Supported
- * data types: same as @p on_value
- * @param[in]  output_stride_x                          Stride of the destination tensor in X
- * dimension (in bytes)
- * @param[in]  output_step_x                            output_stride_x * number of elements along X
- * processed per work item (in bytes)
- * @param[in]  output_stride_y                          Stride of the destination tensor in Y
- * dimension (in bytes)
- * @param[in]  output_step_y                            output_stride_y * number of elements along Y
- * processed per work item (in bytes)
- * @param[in]  output_stride_z                          Stride of the destination tensor in Z
- * dimension (in bytes)
- * @param[in]  output_step_z                            output_stride_z * number of elements along Z
- * processed per work item (in bytes)
- * @param[in]  output_stride_w                          Stride of the destination tensor in W
- * dimension (in bytes)
- * @param[in]  output_step_w                            output_stride_w * number of elements along W
- * processed per work item (in bytes)
- * @param[in]  output_offset_first_element_in_bytes     Offset of the first element in the
- * destination tensor
- */
-__kernel void one_hot(TENSOR3D_DECLARATION(indices), VECTOR_DECLARATION(on_value),
-                      VECTOR_DECLARATION(off_value), TENSOR4D_DECLARATION(output))
-{
-  // Output coordinates; global id 2 carries both the z and the w coordinate
-  const int px = get_global_id(0);
-  const int py = get_global_id(1);
-  const int pz = get_global_id(2) % OUTPUT_DIM_Z;
-  const int pw = get_global_id(2) / OUTPUT_DIM_Z;
-
-  const Tensor3D idx_tensor = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(indices);
-  Tensor4D out_tensor = CONVERT_TO_TENSOR4D_STRUCT(output, OUTPUT_DIM_Z);
-  __global DATA_TYPE *dst = (__global DATA_TYPE *)out_tensor.ptr;
-  __global const DATA_TYPE *on = (__global const DATA_TYPE *)on_value_ptr;
-  __global const DATA_TYPE *off = (__global const DATA_TYPE *)off_value_ptr;
-  // The index is read at the three output coordinates that are not the one-hot axis
-#if AXIS == 0
-  const int index = *(__global const int *)tensor3D_offset(&idx_tensor, py, pz, pw);
-  *dst = (index == px) ? *on : *off;
-#elif AXIS == 1
-  const uint index = *(__global const uint *)tensor3D_offset(&idx_tensor, px, pz, pw);
-  *dst = (index == py) ? *on : *off;
-#elif AXIS == 2
-  const uint index = *(__global const uint *)tensor3D_offset(&idx_tensor, px, py, pw);
-  *dst = (index == pz) ? *on : *off;
-#elif AXIS == 3
-  const uint index = *(__global const uint *)tensor3D_offset(&idx_tensor, px, py, pz);
-  *dst = (index == pw) ? *on : *off;
-#endif // AXIS
-}
-
-/** Performs the OneHot operation along the chosen axis as off_value being zero
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
- * -DDATA_TYPE=short
- * @note Axis should be given as a preprocessor argument using -DAXIS=axis. e.g. -DAXIS=1
- * @attention Output tensor depth should be given as a preprocessor argument using
- * -DOUTPUT_DIM_Z=size. e.g. -DOUTPUT_DIM_Z=16
- * @attention Input tensor depth should be given as a preprocessor argument using
- * -DINPUT_DIM_Z=size. e.g. -DINPUT_DIM_Z=16
- *
- *
- * @param[in]  indices_ptr                              Pointer to the source tensor. Supported data
- * types: S32
- * @param[in]  indices_stride_x                         Stride of the source tensor in X dimension
- * (in bytes)
- * @param[in]  indices_step_x                           indices_stride_x * number of elements along
- * X processed per work item (in bytes)
- * @param[in]  indices_stride_y                         Stride of the source tensor in Y dimension
- * (in bytes)
- * @param[in]  indices_step_y                           indices_stride_y * number of elements along
- * Y processed per work item (in bytes)
- * @param[in]  indices_stride_z                         Stride of the source tensor in Z dimension
- * (in bytes)
- * @param[in]  indices_step_z                           indices_stride_z * number of elements along
- * Z processed per work item (in bytes)
- * @param[in]  indices_offset_first_element_in_bytes    Offset of the first element in the source
- * tensor
- * @param[in]  on_value_ptr                             Pointer to the on_value vector. Supported
- * data types: U8/S8/U16/S16/F16/U32/S32/F32.
- * @param[in]  on_value_stride_x                        Stride of the on_value vector in X dimension
- * (in bytes)
- * @param[in]  on_value_step_x                          on_value_stride_x * number of elements along
- * X processed per work item (in bytes)
- * @param[in]  on_value_offset_first_element_in_bytes   Offset of the first element in the on_value
- * vector
- * @param[out] output_ptr                               Pointer to the destination tensor. Supported
- * data types: same as @p on_value
- * @param[in]  output_stride_x                          Stride of the destination tensor in X
- * dimension (in bytes)
- * @param[in]  output_step_x                            output_stride_x * number of elements along X
- * processed per work item (in bytes)
- * @param[in]  output_stride_y                          Stride of the destination tensor in Y
- * dimension (in bytes)
- * @param[in]  output_step_y                            output_stride_y * number of elements along Y
- * processed per work item (in bytes)
- * @param[in]  output_stride_z                          Stride of the destination tensor in Z
- * dimension (in bytes)
- * @param[in]  output_step_z                            output_stride_z * number of elements along Z
- * processed per work item (in bytes)
- * @param[in]  output_stride_w                          Stride of the destination tensor in W
- * dimension (in bytes)
- * @param[in]  output_step_w                            output_stride_w * number of elements along W
- * processed per work item (in bytes)
- * @param[in]  output_offset_first_element_in_bytes     Offset of the first element in the
- * destination tensor
- */
-__kernel void one_hot_only_on_value(TENSOR3D_DECLARATION(indices), VECTOR_DECLARATION(on_value),
-                                    TENSOR4D_DECLARATION(output))
-{
-  // The three global ids address one element of the indices tensor
-  const int ix = get_global_id(0);
-  const int iy = get_global_id(1);
-  const int iz = get_global_id(2);
-
-  const Tensor3D idx_tensor = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(indices);
-  const Tensor4D out_tensor = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, OUTPUT_DIM_Z);
-  const int hot = *(__global const int *)tensor3D_offset(&idx_tensor, ix, iy, iz);
-
-  // Only indices inside [0, DEPTH) produce a write; any other value leaves the output untouched
-  if (hot >= 0 && hot < DEPTH)
-  {
-    const DATA_TYPE on = *((__global const DATA_TYPE *)on_value_ptr);
-
-    // The index is inserted at the position of the one-hot axis to address the output element
-#if AXIS == 0
-    *(__global DATA_TYPE *)tensor4D_offset(&out_tensor, hot, ix, iy, iz) = on;
-#elif AXIS == 1
-    *(__global DATA_TYPE *)tensor4D_offset(&out_tensor, ix, hot, iy, iz) = on;
-#elif AXIS == 2
-    *(__global DATA_TYPE *)tensor4D_offset(&out_tensor, ix, iy, hot, iz) = on;
-#elif AXIS == 3
-    *(__global DATA_TYPE *)tensor4D_offset(&out_tensor, ix, iy, iz, hot) = on;
-#endif // AXIS
-  }
-}
-
-#endif // defined(DATA_TYPE) && defined(AXIS) && defined(DEPTH) && defined(OUTPUT_DIM_Z)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
deleted file mode 100644
index 76fda9041..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers_asymm.h"
-
-#ifdef SATURATE /* Saturating form: pastes to convert_<type>_sat<round>(x) */
-#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x))
-#else /* !SATURATE: non-saturating form, pastes to convert_<type><round>(x) */
-#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x))
-#endif /* SATURATE */
-#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round) /* expands args */
-
-#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
-/** Performs a pixelwise multiplication used to quantize down the int32 accumulator values of
- *  GEMMLowp to QASYMM8
- *
- * The following computations will be performed by the kernel:
- *
- *  -# Add offset terms to inputs
- *  -# Multiply inputs
- *  -# Add offset terms to final result
- *  -# Multiply each entry of result by result_mult_int
- *  -# Shift the int32 accumulator by result_shift
- *  -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
- *
- * @attention The inputs and output data types need to be passed at compile time using
- *            -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
- *            e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
- * @attention The offset factor of inputs must be passed at compile time using -DIN1_OFFSET and
- *            -DIN2_OFFSET
- * @attention The offset, scalar scale factor and number of bits to shift right of output tensor
- *            must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULT_INT and
- *            -DRESULT_SHIFT
- *
- * @param[in]  in1_ptr                           Pointer to the source image. Supported data types:
- *                                               U8
- * @param[in]  in1_stride_x                      Stride of the source image in X dimension (in
- *                                               bytes)
- * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed
- *                                               per workitem(in bytes)
- * @param[in]  in1_stride_y                      Stride of the source image in Y dimension (in
- *                                               bytes)
- * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed
- *                                               per workitem(in bytes)
- * @param[in]  in1_stride_z                      Stride of the source image in Z dimension (in
- *                                               bytes)
- * @param[in]  in1_step_z                        in1_stride_z * number of elements along Z processed
- *                                               per workitem(in bytes)
- * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in]  in2_ptr                           Pointer to the source image. Supported data types:
- *                                               U8
- * @param[in]  in2_stride_x                      Stride of the source image in X dimension (in
- *                                               bytes)
- * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed
- *                                               per workitem(in bytes)
- * @param[in]  in2_stride_y                      Stride of the source image in Y dimension (in
- *                                               bytes)
- * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed
- *                                               per workitem(in bytes)
- * @param[in]  in2_stride_z                      Stride of the source image in Z dimension (in
- *                                               bytes)
- * @param[in]  in2_step_z                        in2_stride_z * number of elements along Z processed
- *                                               per workitem(in bytes)
- * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr                           Pointer to the destination image. Supported data
- *                                               types: U8
- * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in
- *                                               bytes)
- * @param[in]  out_step_x                        out_stride_x * number of elements along X processed
- *                                               per workitem(in bytes)
- * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in
- *                                              bytes)
- * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed
- *                                               per workitem(in bytes)
- * @param[in]  out_stride_z                      Stride of the destination image in Z dimension (in
- *                                               bytes)
- * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed
- *                                               per workitem(in bytes)
- * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination
- *                                               image
- * @param[in]  scale                             Float scaling factor. Supported data types: F32
- */
-__kernel void pixelwise_mul_qasymm8(TENSOR3D_DECLARATION(in1), TENSOR3D_DECLARATION(in2),
-                                    TENSOR3D_DECLARATION(out), const float scale)
-{
-  // NOTE: 'scale' is not read here; only the build-time RESULT_* constants drive requantization.
-  // Get pixels pointer
-  Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
-  Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
-  Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
-
-  // Load 16 elements from each input and widen them to int
-  VEC_DATA_TYPE(int, 16)
-  in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16));
-  VEC_DATA_TYPE(int, 16)
-  in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16));
-
-  // Add the input offsets, then multiply the two inputs element-wise
-  VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET);
-  VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET);
-  VEC_DATA_TYPE(int, 16) out_val = in1_val * in2_val;
-
-  // Multiply with a multiplier smaller than 1, then add the result offset
-  out_val =
-      ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(out_val, RESULT_MULT_INT, RESULT_SHIFT, 16);
-  out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET);
-
-  // Clamp to [0..255]: a non-saturating conversion would not clamp out-of-range int values.
-  VEC_DATA_TYPE(uchar, 16) res = convert_uchar16_sat(out_val);
-  // TODO: Apply min-max BOUND to support fuse with relu.
-  /*
-  #if defined(MIN_BOUND)
-      res = max(res, (uchar16)MIN_BOUND);
-  #endif // defined(MIN_BOUND)
-  #if defined(MAX_BOUND)
-      res = min(res, (uchar16)MAX_BOUND);
-  #endif // defined(MAX_BOUND)
-  */
-  // Store result
-  VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr);
-}
-#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl
deleted file mode 100644
index 4ae9adb0b..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-
-#define CONVERT_RTE(x, type) (convert_##type##_rte((x))) // scalar convert_<type>_rte
-#define CONVERT_RTE_VEC_STR(x, type, size) (convert_##type##size##_rte((x)))
-#define CONVERT_RTE_VEC(x, type, size) CONVERT_RTE_VEC_STR(x, type, size) // vector counterpart
-#define MIN_QUANT_VAL -127 // lower clamp bound applied by quantization_symm8
-#define MAX_QUANT_VAL 127  // upper clamp bound applied by quantization_symm8
-
-#if defined(VEC_SIZE) && defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
-
-/** This performs the symmetric quantization of floating point inputs to 8-bit signed integers.
- *
- * @note Input data type should be given as a preprocessor argument using -DDATA_TYPE_IN=type. e.g.
- * -DDATA_TYPE_IN=float
- * @note Output data type should be given as a preprocessor argument using -DDATA_TYPE_OUT=type.
- * e.g. -DDATA_TYPE_OUT=char
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
- * -DVEC_SIZE=16
- * @note The quantization scale is not a preprocessor argument: it is read from the scale tensor,
- * one value per work item row (indexed by get_global_id(1))
- * @note No quantization offset is applied (symmetric quantization); -DOFFSET is not used by this
- * kernel
- * @note The minimum value of the quantized range is fixed by the MIN_QUANT_VAL define in this
- * file (-127)
- * @note The maximum value of the quantized range is fixed by the MAX_QUANT_VAL define in this
- * file (127)
- *
- * @param[in]  input_ptr                            Pointer to the source tensor. Supported data
- * types: F32
- * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in
- * bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in
- * bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source
- * tensor
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported
- * data types: S8
- * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension
- * (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension
- * (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the
- * destination tensor
- * @param[in]  scale_ptr                            Pointer to the scale tensor. Supported data
- * types: F32
- * @param[in]  scale_stride_x                       Stride of the scale tensor in X dimension
- * (in bytes)
- * @param[in]  scale_step_x                         scale_stride_x * number of elements along X
- * processed per workitem(in bytes)
- */
-__kernel void quantization_symm8(IMAGE_DECLARATION(input), VECTOR_DECLARATION(scale),
-                                 IMAGE_DECLARATION(output))
-{
-  // Resolve this work item's position in the source and destination images
-  Image src = CONVERT_TO_IMAGE_STRUCT(input);
-  Image dst = CONVERT_TO_IMAGE_STRUCT(output);
-
-  // The scale applied by this work item is entry get_global_id(1) of the scale vector
-  const DATA_TYPE_IN row_scale = *(((__global DATA_TYPE_IN *)(scale_ptr)) + get_global_id(1));
-
-#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
-  // If this vector would start past LAST_ACCESSED_X, move both pointers back by the overshoot
-  // so that the accessed elements stay within bounds
-  const int overshoot = max((int)(get_global_id(0) * VEC_SIZE) - (int)LAST_ACCESSED_X, 0);
-  src.ptr -= overshoot * input_stride_x;
-  dst.ptr -= overshoot * output_stride_x;
-
-  // Fetch VEC_SIZE input elements
-  const VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_vec =
-      VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)src.ptr);
-  // Broadcast the scale across all lanes
-  const VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) scale_vec = row_scale;
-
-  // Divide by the scale, round with the _rte conversion and clamp to [MIN, MAX]_QUANT_VAL
-  const VEC_DATA_TYPE(int, VEC_SIZE) q_vec =
-      CLAMP(CONVERT_RTE_VEC(in_vec / scale_vec, int, VEC_SIZE), MIN_QUANT_VAL, MAX_QUANT_VAL);
-
-  // Convert to the output type and write the vector out
-  VSTORE(VEC_SIZE)
-  (CONVERT(q_vec, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
-#else  //! defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
-  // Scalar path: quantize the single element this work item points at
-  const int q_val = CLAMP(CONVERT_RTE((*(__global DATA_TYPE_IN *)src.ptr) / row_scale, int),
-                          MIN_QUANT_VAL, MAX_QUANT_VAL);
-  *((__global DATA_TYPE_OUT *)(dst.ptr)) = (DATA_TYPE_OUT)q_val;
-#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
-}
-#endif // defined(VEC_SIZE) && defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl
deleted file mode 100644
index 832ac1270..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl
+++ /dev/null
@@ -1,212 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
-/** Perform reduce max/min
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
- *       -DDATA_TYPE=short
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
- *            e.g. -DDEPTH_OUT=16
- * @attention Operation type(code) specifying which operation to perform should be passed as
- *            preprocessor argument using -DOP_CODE = number. e.g. -DOP_CODE=1
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data
- *                                                  types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in
- *                                                  bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X
- *                                                  processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in
- *                                                  bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y
- *                                                  processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in
- *                                                  bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z
- *                                                  processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source
- *                                                  image
- * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in
- *                                                  bytes)
- * @param[in]  input_step_w                         input_stride_w * number of elements along W
- *                                                  processed per workitem(in bytes)
- * @param[out] output_ptr                           Pointer to the destination image. Supported data
- *                                                  types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension
- *                                                  (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension
- *                                                  (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension
- *                                                  (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension
- *                                                  (in bytes)
- * @param[in]  output_step_w                        output_stride_w * number of elements along W
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the
- *                                                  destination image
- * @param[in]  axis                                 Axis through which reduction occurs
- * @param[in]  dim                                  Dimension across the axis to be reduced.
- */
-__kernel void reduce_min_max(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output),
-                             const int axis, const int dim)
-{
-  Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
-  Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
-
-  // Global id 2 packs two coordinates: id % DEPTH_OUT and id / DEPTH_OUT.
-  int coord[4];
-  coord[0] = get_global_id(0);
-  coord[1] = get_global_id(1);
-  coord[2] = get_global_id(2) % DEPTH_OUT;
-  coord[3] = get_global_id(2) / DEPTH_OUT;
-
-  // Seed the accumulator with the element at the work item's own coordinates.
-  DATA_TYPE acc =
-      *((__global DATA_TYPE *)tensor4D_offset(&src, coord[0], coord[1], coord[2], coord[3]));
-  // Visit positions 1..dim-1 along the reduced axis and fold each element into the accumulator.
-  for (int step = 1; step < dim; ++step)
-  {
-    coord[axis] = step;
-    const DATA_TYPE elem =
-        *((__global DATA_TYPE *)tensor4D_offset(&src, coord[0], coord[1], coord[2], coord[3]));
-#if OP_CODE == 1 // REDUCE_MAX
-    acc = max(acc, elem);
-#elif OP_CODE == 2 // REDUCE_MIN
-    acc = min(acc, elem);
-#else // OP NOT SUPPORTED
-    return;
-#endif
-  }
-
-  *((__global DATA_TYPE *)dst.ptr) = acc;
-}
-
-/** Perform reduce sum/mean
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
- *       -DDATA_TYPE=short
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
- *            e.g. -DDEPTH_OUT=16
- * @attention Operation type(code) specifying which operation to perform should be passed as
- *            preprocessor argument using -DOP_CODE = number. e.g. -DOP_CODE=1
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data
- *                                                  types: U8/S8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in
- *                                                  bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X
- *                                                  processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in
- *                                                  bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y
- *                                                  processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in
- *                                                  bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z
- *                                                  processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source
- *                                                  image
- * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in
- *                                                  bytes)
- * @param[in]  input_step_w                         input_stride_w * number of elements along W
- *                                                  processed per workitem(in bytes)
- * @param[out] output_ptr                           Pointer to the destination image. Supported data
- *                                                  types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension
- *                                                  (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension
- *                                                  (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension
- *                                                  (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension
- *                                                  (in bytes)
- * @param[in]  output_step_w                        output_stride_w * number of elements along W
- *                                                  processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the
- *                                                  destination image
- * @param[in]  axis                                 Axis through which reduction occurs
- * @param[in]  dim                                  Dimension across the axis to be reduced.
- */
-__kernel void reduce_sum_mean(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output),
-                              const int axis, const int dim)
-{
-  Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
-  Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
-
-  // Global id 2 packs two coordinates: id % DEPTH_OUT and id / DEPTH_OUT.
-  int coord[4];
-  coord[0] = get_global_id(0);
-  coord[1] = get_global_id(1);
-  coord[2] = get_global_id(2) % DEPTH_OUT;
-  coord[3] = get_global_id(2) / DEPTH_OUT;
-
-  // Add up all `dim` elements along the reduced axis; the running sum is kept in DATA_TYPE.
-  DATA_TYPE total = (DATA_TYPE)0;
-  for (int step = 0; step < dim; ++step)
-  {
-    coord[axis] = step;
-    total +=
-        *((__global DATA_TYPE *)tensor4D_offset(&src, coord[0], coord[1], coord[2], coord[3]));
-  }
-  // Write the sum or the mean; any other OP_CODE leaves the output untouched.
-#if OP_CODE == 3 // REDUCE_SUM
-  *((__global DATA_TYPE *)dst.ptr) = total;
-#elif OP_CODE == 4 // REDUCE_MEAN
-  *((__global DATA_TYPE *)dst.ptr) = total / CONVERT(dim, DATA_TYPE);
-#else // OP NOT SUPPORTED
-  return;
-#endif
-}
-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/scale_factor.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/scale_factor.cl
deleted file mode 100644
index 3d5e90356..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/scale_factor.cl
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-
-#if defined(WIDTH)
-/** This function computes max(|min|, |max|) / 127 over the row handled by each work item.
- *
- * @note The width of the input tensor must be provided at compile time using -DWIDTH (e.g.
- * -DWIDTH=320). HEIGHT and DEPTH are not read by this kernel
- *
- * @param[in] src_ptr                           Pointer to the source tensor. Supported data types:
- * F32
- * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x                        src_stride_x * number of elements along X processed
- * per workitem(in bytes)
- * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y                        src_stride_y * number of elements along Y processed
- * per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                          Pointer to the scale vector. The computed scale
- * is written at position get_global_id(1). Supported data types: F32.
- * @param[in] dst_stride_x                      Stride of the scale vector in X dimension (in
- * bytes)
- * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed
- * per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the scale
- * vector
- */
-__kernel void scale_factor_symm8(IMAGE_DECLARATION(src), VECTOR_DECLARATION(dst))
-{
-  Image row = CONVERT_TO_IMAGE_STRUCT(src);
-  __global float *row_data = (__global float *)(row.ptr);
-
-  // Four running minima / maxima, seeded with +FLT_MAX / -FLT_MAX
-  float4 lo = (float4)FLT_MAX;
-  float4 hi = (float4)-FLT_MAX;
-
-  // Main loop: read eight floats at a time and fold both halves into the four lanes
-  int col = 0;
-  while (col <= (int)(WIDTH - 8))
-  {
-    const float8 chunk = vload8(0, (__global float *)(row_data + col));
-    const float4 head = chunk.s0123;
-    const float4 tail = chunk.s4567;
-
-    lo = select(head, lo, lo < head);
-    lo = select(tail, lo, lo < tail);
-    hi = select(head, hi, hi > head);
-    hi = select(tail, hi, hi > tail);
-    col += 8;
-  }
-
-  // Tail loop: elements left over after the 8-wide loop only update lane 0
-  while (col < WIDTH)
-  {
-    const float tail_val = row_data[col++];
-    lo.s0 = min(lo.s0, tail_val);
-    hi.s0 = max(hi.s0, tail_val);
-  }
-
-  // Collapse the four lanes into a single minimum and maximum
-  const float2 lo2 = min(lo.s01, lo.s23);
-  const float2 hi2 = max(hi.s01, hi.s23);
-  const float row_min = min(lo2.s0, lo2.s1);
-  const float row_max = max(hi2.s0, hi2.s1);
-
-  // Scale = largest magnitude / 127, stored at entry get_global_id(1) of dst
-  *((__global float *)(dst_ptr) + get_global_id(1)) = max(fabs(row_min), fabs(row_max)) / 127.0f;
-}
-#endif // defined(WIDTH)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl
deleted file mode 100644
index 3eb1a4ce7..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-
-__kernel void topkv2_init(VECTOR_DECLARATION(input), __global float *in_key_buf,
-                          __global int *in_ind_buf, const int n)
-{
-  int gid = get_global_id(0);
-  int lws = get_local_size(0);
-  int groups = get_num_groups(0);
-  int gws = lws * groups;
-  int iter = n / gws;
-
-  Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input);
-
-  for (int i = 0; i < iter; ++i)
-  {
-    int idx = i * gws + gid;
-    in_key_buf[idx] = *(__global float *)(input.ptr + idx * input.stride_x);
-    in_ind_buf[idx] = idx;
-  }
-}
-
-__kernel void topkv2_find_first_negative(__global float *out_key_buf,
-                                         __global int *first_negative_idx, int n)
-{
-  int gid = get_global_id(0);
-
-  if (gid == n - 1)
-  {
-    // if the last item is positive, the first negative index is n.
-    if (out_key_buf[gid] > 0.f)
-      *first_negative_idx = n;
-  }
-  else if (gid == 0)
-  {
-    // if the first item is negative, set it 0.
-    if (out_key_buf[gid] < 0.f)
-      *first_negative_idx = 0;
-  }
-  else
-  {
-    // if its left is positive and it is negative, then it is the first negative item.
-    if (out_key_buf[gid - 1] > 0.f && out_key_buf[gid] < 0.f)
-      *first_negative_idx = gid;
-  }
-}
-
-__kernel void topkv2_reorder_negatives(__global float *in_key_buf, __global float *out_key_buf,
-                                       __global float *in_ind_buf, __global float *out_ind_buf,
-                                       __global int *first_negative_idx, int n)
-{
-  int gid = get_global_id(0);
-
-  int num_negs = n - *first_negative_idx;
-  int in_idx;
-
-  if (gid < num_negs)
-  {
-    in_idx = n - 1 - gid;
-  }
-  else
-  {
-    in_idx = gid - num_negs;
-  }
-
-  out_key_buf[gid] = in_key_buf[in_idx];
-  out_ind_buf[gid] = in_ind_buf[in_idx];
-}
-
-__kernel void topkv2_store(VECTOR_DECLARATION(values), VECTOR_DECLARATION(indices),
-                           __global float *out_key_buf, __global int *out_ind_buf, int n)
-{
-  int gid = get_global_id(0);
-
-  Vector values = CONVERT_TO_VECTOR_STRUCT_NO_STEP(values);
-  Vector indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(indices);
-
-  int idx = n - 1 - gid;
-
-  *(__global float *)(values.ptr + gid * values.stride_x) = out_key_buf[idx];
-  *(__global int *)(indices.ptr + gid * indices.stride_x) = out_ind_buf[idx];
-}
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl
deleted file mode 100644
index 460de790b..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-
-__global inline float *get_vec_elem(Vector *vec, int idx)
-{
-  return (__global float *)(vec->ptr + idx * vec->stride_x);
-}
-
-__global inline int *get_vec_elem_int(Vector *vec, int idx)
-{
-  return (__global int *)(vec->ptr + idx * vec->stride_x);
-}
-
-// A utility function to swap two elements
-void swap(__global float *a, __global float *b)
-{
-  float t = *a;
-  *a = *b;
-  *b = t;
-}
-
-void swap_idx(__global int *a, __global int *b)
-{
-  int t = *a;
-  *a = *b;
-  *b = t;
-}
-
-/* This function is same in both iterative and recursive*/
-int partition(Vector *arr, __global int *indices, int l, int h)
-{
-  float x = *get_vec_elem(arr, h);
-  int i = (l - 1);
-
-  for (int j = l; j <= h - 1; j++)
-  {
-    if (*get_vec_elem(arr, j) >= x)
-    {
-      i++;
-      swap(get_vec_elem(arr, i), get_vec_elem(arr, j));
-      swap_idx(&indices[i], &indices[j]);
-    }
-  }
-  swap(get_vec_elem(arr, i + 1), get_vec_elem(arr, h));
-  swap_idx(&indices[i + 1], &indices[h]);
-  return (i + 1);
-}
-
-/* A[] --> Array to be sorted,
-   l  --> Starting index,
-   h  --> Ending index */
-void quickSortIterative(Vector *arr, __global int *indices, __global int *stack, int l, int h)
-{
-  // Create an auxiliary stack
-
-  // initialize top of stack
-  int top = -1;
-
-  // push initial values of l and h to stack
-  stack[++top] = l;
-  stack[++top] = h;
-
-  // Keep popping from stack while is not empty
-  while (top >= 0)
-  {
-    // Pop h and l
-    h = stack[top--];
-    l = stack[top--];
-
-    // Set pivot element at its correct position
-    // in sorted array
-    int p = partition(arr, indices, l, h);
-
-    // If there are elements on left side of pivot,
-    // then push left side to stack
-    if (p - 1 > l)
-    {
-      stack[++top] = l;
-      stack[++top] = p - 1;
-    }
-
-    // If there are elements on right side of pivot,
-    // then push right side to stack
-    if (p + 1 < h)
-    {
-      stack[++top] = p + 1;
-      stack[++top] = h;
-    }
-  }
-}
-
-__kernel void topkv2_quicksort(VECTOR_DECLARATION(input), VECTOR_DECLARATION(topk_values),
-                               VECTOR_DECLARATION(topk_indices), __global int *indices,
-                               __global int *temp_stack, int k, int n)
-{
-  Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input);
-  Vector topk_values = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_values);
-  Vector topk_indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_indices);
-
-  for (int i = 0; i < n; ++i)
-  {
-    indices[i] = i;
-  }
-
-  quickSortIterative(&input, indices, temp_stack, 0, n - 1);
-
-  // extract k items.
-  for (int i = 0; i < k; ++i)
-  {
-    *get_vec_elem(&topk_values, i) = *get_vec_elem(&input, i);
-    *get_vec_elem_int(&topk_indices, i) = indices[i];
-  }
-}
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
deleted file mode 100644
index e9d4696b4..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
+++ /dev/null
@@ -1,292 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-// reference:
-// https://code.google.com/archive/p/ocl-radix-sort/source/default/source
-// OpenCL kernel sources for the CLRadixSort class
-// the #include does not exist in OpenCL
-// Copyright Philippe Helluy, Université de Strasbourg, France, 2011, helluy@math.unistra.fr
-// licensed under the GNU Lesser General Public License see http://www.gnu.org/copyleft/lesser.html
-// if you find this software usefull you can cite the following work in your reports or articles:
-// Philippe HELLUY, A portable implementation of the radix sort algorithm in OpenCL, 2011.
-// http://hal.archives-ouvertes.fr/hal-00596730
-
-// Reference for floating point radix sort:
-// http://www.codercorner.com/RadixSortRevisited.htm
-
-// compute the histogram for each radix and each virtual processor for the pass
-__kernel void radixsort_histogram(__global float *in_key_buf, __global int *d_Histograms,
-                                  const int pass, __local int *loc_histo, const int n)
-{
-  int it = get_local_id(0);  // i local number of the processor
-  int ig = get_global_id(0); // global number = i + g I
-
-  int gr = get_group_id(0); // g group number
-
-  int groups = get_num_groups(0);
-  int items = get_local_size(0);
-
-  // set the local histograms to zero
-  for (int ir = 0; ir < _RADIX; ir++)
-  {
-    loc_histo[ir * items + it] = 0;
-  }
-
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  // range of keys that are analyzed by the work item
-  int size = n / groups / items; // size of the sub-list
-  int start = ig * size;         // beginning of the sub-list
-
-  unsigned int key;
-  int shortkey, k;
-
-  // compute the index
-  // the computation depends on the transposition
-  for (int j = 0; j < size; j++)
-  {
-#ifdef TRANSPOSE
-    k = groups * items * j + ig;
-#else
-    k = j + start;
-#endif
-
-    key = *((__global unsigned int *)(in_key_buf + k));
-
-    // extract the group of _BITS bits of the pass
-    // the result is in the range 0.._RADIX-1
-    shortkey = ((key >> (pass * _BITS)) & (_RADIX - 1));
-
-    // increment the local histogram
-    loc_histo[shortkey * items + it]++;
-  }
-
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  // copy the local histogram to the global one
-  for (int ir = 0; ir < _RADIX; ir++)
-  {
-    d_Histograms[items * (ir * groups + gr) + it] = loc_histo[ir * items + it];
-  }
-
-  barrier(CLK_GLOBAL_MEM_FENCE);
-}
-
-// initial transpose of the list for improving
-// coalescent memory access
-__kernel void transpose(const __global int *invect, __global int *outvect, const int nbcol,
-                        const int nbrow, const __global int *inperm, __global int *outperm,
-                        __local int *blockmat, __local int *blockperm, const int tilesize)
-{
-
-  int i0 = get_global_id(0) * tilesize; // first row index
-  int j = get_global_id(1);             // column index
-
-  int jloc = get_local_id(1); // local column index
-
-  // fill the cache
-  for (int iloc = 0; iloc < tilesize; iloc++)
-  {
-    int k = (i0 + iloc) * nbcol + j; // position in the matrix
-    blockmat[iloc * tilesize + jloc] = invect[k];
-#ifdef PERMUT
-    blockperm[iloc * tilesize + jloc] = inperm[k];
-#endif
-  }
-
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  // first row index in the transpose
-  int j0 = get_group_id(1) * tilesize;
-
-  // put the cache at the good place
-  for (int iloc = 0; iloc < tilesize; iloc++)
-  {
-    int kt = (j0 + iloc) * nbrow + i0 + jloc; // position in the transpose
-    outvect[kt] = blockmat[jloc * tilesize + iloc];
-#ifdef PERMUT
-    outperm[kt] = blockperm[jloc * tilesize + iloc];
-#endif
-  }
-}
-
-// each virtual processor reorders its data using the scanned histogram
-__kernel void radixsort_reorder(__global float *in_key, __global float *out_key,
-                                __global int *d_Histograms, const int pass,
-                                __global int *indices_in, __global int *indices_out,
-                                __local int *loc_histo, const int n)
-{
-
-  int it = get_local_id(0);
-  int ig = get_global_id(0);
-
-  int gr = get_group_id(0);
-  int groups = get_num_groups(0);
-  int items = get_local_size(0);
-
-  int start = ig * (n / groups / items);
-  int size = n / groups / items;
-
-  // take the histogram in the cache
-  for (int ir = 0; ir < _RADIX; ir++)
-  {
-    loc_histo[ir * items + it] = d_Histograms[items * (ir * groups + gr) + it];
-  }
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  int newpos, shortkey, k, newpost;
-  unsigned int key;
-
-  for (int j = 0; j < size; j++)
-  {
-#ifdef TRANSPOSE
-    k = groups * items * j + ig;
-#else
-    k = j + start;
-#endif
-    float org_value = in_key[k];
-    key = *(__global unsigned int *)(in_key + k);
-    shortkey = ((key >> (pass * _BITS)) & (_RADIX - 1));
-
-    newpos = loc_histo[shortkey * items + it];
-
-#ifdef TRANSPOSE
-    int ignew, jnew;
-    ignew = newpos / (n / groups / items);
-    jnew = newpos % (n / groups / items);
-    newpost = jnew * (groups * items) + ignew;
-#else
-    newpost = newpos;
-#endif
-
-    // d_outKeys[newpost]= key;  // killing line !!!
-    out_key[newpost] = org_value;
-
-#ifdef PERMUT
-    indices_out[newpost] = indices_in[k];
-#endif
-
-    newpos++;
-    loc_histo[shortkey * items + it] = newpos;
-  }
-}
-
-// perform a parallel prefix sum (a scan) on the local histograms
-// (see Blelloch 1990) each workitem worries about two memories
-// see also http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html
-__kernel void radixsort_scanhistograms(__global int *histo, __local int *temp,
-                                       __global int *globsum)
-{
-  int it = get_local_id(0);
-  int ig = get_global_id(0);
-  int decale = 1;
-  int n = get_local_size(0) * 2;
-  int gr = get_group_id(0);
-
-  // load input into local memory
-  // up sweep phase
-  temp[2 * it] = histo[2 * ig];
-  temp[2 * it + 1] = histo[2 * ig + 1];
-
-  // parallel prefix sum (algorithm of Blelloch 1990)
-  for (int d = n >> 1; d > 0; d >>= 1)
-  {
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (it < d)
-    {
-      int ai = decale * (2 * it + 1) - 1;
-      int bi = decale * (2 * it + 2) - 1;
-      temp[bi] += temp[ai];
-    }
-    decale *= 2;
-  }
-
-  // store the last element in the global sum vector
-  // (maybe used in the next step for constructing the global scan)
-  // clear the last element
-  if (it == 0)
-  {
-    globsum[gr] = temp[n - 1];
-    temp[n - 1] = 0;
-  }
-
-  // down sweep phase
-  for (int d = 1; d < n; d *= 2)
-  {
-    decale >>= 1;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (it < d)
-    {
-      int ai = decale * (2 * it + 1) - 1;
-      int bi = decale * (2 * it + 2) - 1;
-
-      int t = temp[ai];
-      temp[ai] = temp[bi];
-      temp[bi] += t;
-    }
-  }
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  // write results to device memory
-
-  histo[2 * ig] = temp[2 * it];
-  histo[2 * ig + 1] = temp[2 * it + 1];
-
-  barrier(CLK_GLOBAL_MEM_FENCE);
-}
-
-// use the global sum for updating the local histograms
-// each work item updates two values
-__kernel void radixsort_pastehistograms(__global int *histo, __global int *globsum)
-{
-  int ig = get_global_id(0);
-  int gr = get_group_id(0);
-
-  int s;
-
-  s = globsum[gr];
-
-  // write results to device memory
-  histo[2 * ig] += s;
-  histo[2 * ig + 1] += s;
-
-  barrier(CLK_GLOBAL_MEM_FENCE);
-}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp
deleted file mode 100644
index 047004d5e..000000000
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp
+++ /dev/null
@@ -1,329 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-constexpr unsigned int vector_size = 16;
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *prev_output,
-                          const ITensorInfo *output, unsigned int axis, ReductionOperation op)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32,
-                                                       DataType::F16, DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX &&
-                                      op != ReductionOperation::ARG_IDX_MIN,
-                                  "Only ARG_IDX_MAX and ARG_IDX_MIN are supported");
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
-                                  "Reduction axis greater than max number of dimensions");
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
-
-  if (output->total_size() != 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32,
-                                                         DataType::S64);
-  }
-  if (prev_output != nullptr && prev_output->total_size() != 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(prev_output, 1, DataType::U32,
-                                                         DataType::S32, DataType::S64);
-    if (output->total_size() != 0)
-    {
-      ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(prev_output, output);
-    }
-  }
-
-  return Status{};
-}
-
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input,
-                                                         ITensorInfo *prev_output,
-                                                         ITensorInfo *output, unsigned int axis,
-                                                         ReductionOperation op)
-{
-  ARM_COMPUTE_UNUSED(op);
-  // Output tensor auto initialization if not yet initialized
-  TensorShape output_shape{input->tensor_shape()};
-  output_shape.set(axis, 1);
-  DataType output_data_type = (prev_output != nullptr) ? (prev_output->data_type()) : DataType::S32;
-  auto_init_if_empty(*output, input->clone()
-                                  ->set_tensor_shape(output_shape)
-                                  .set_data_type(output_data_type)
-                                  .reset_padding()
-                                  .set_is_resizable(true));
-
-  Window win = calculate_max_window((prev_output != nullptr) ? (*prev_output) : (*input),
-                                    Steps(vector_size));
-  bool window_changed = false;
-
-  switch (axis)
-  {
-    case 0:
-    {
-      ITensorInfo *input_tensor_access = prev_output != nullptr ? prev_output : input;
-      AccessWindowStatic input_access(input_tensor_access, 0, 0,
-                                      static_cast<int>(input_tensor_access->dimension(0)), 1);
-      AccessWindowHorizontal output_access(output, 0, 1);
-      window_changed = update_window_and_padding(win, input_access, output_access);
-      output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-    }
-    break;
-    case 1:
-    case 2:
-    case 3:
-    {
-      AccessWindowHorizontal input_access(input, 0, vector_size);
-      AccessWindowHorizontal output_access(output, 0, vector_size);
-      window_changed = update_window_and_padding(win, input_access, output_access);
-      output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-    }
-    break;
-    default:
-      ARM_COMPUTE_ERROR("Not supported");
-  }
-
-  Status err = (window_changed)
-                   ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
-                   : Status{};
-  return std::make_tuple(err, win);
-}
-} // namespace
-
-CLArgMinMaxLayerKernelEx::CLArgMinMaxLayerKernelEx()
-    : _input(nullptr), _prev_output(nullptr), _output(nullptr), _reduction_axis(0),
-      _op(ReductionOperation::ARG_IDX_MAX)
-{
-}
-
-void CLArgMinMaxLayerKernelEx::configure(const ICLTensor *input, const ICLTensor *prev_output,
-                                         ICLTensor *output, unsigned int axis,
-                                         ReductionOperation op)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_ERROR_THROW_ON(
-      validate_arguments(input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr,
-                         output->info(), axis, op));
-  auto win_config = validate_and_configure_window(
-      input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr, output->info(), axis,
-      op);
-  ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-
-  _input = input;
-  _prev_output = prev_output;
-  _output = output;
-  _reduction_axis = axis;
-  _op = op;
-
-  // Set build options
-  CLBuildOptions build_opts;
-
-  build_opts.add_option_if(_prev_output != nullptr, "-DPREV_OUTPUT");
-  build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-  build_opts.add_option_if(is_data_type_float(input->info()->data_type()), "-DFLOAT_DATA_TYPE");
-  build_opts.add_option_if_else(op == ReductionOperation::ARG_IDX_MAX, "-DARG_MAX", "-DARG_MIN");
-  build_opts.add_option("-DDATA_TYPE_OUTPUT=" +
-                        get_cl_type_from_data_type(output->info()->data_type()));
-  build_opts.add_option("-DDATA_TYPE_SELECT=" +
-                        get_cl_signed_type_from_element_size(input->info()->element_size()));
-
-  // Create kernel
-  cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange();
-  std::string kernel_axis_name;
-  switch (axis)
-  {
-    case 0:
-    {
-      const ICLTensor *input_for_width = prev_output != nullptr ? _prev_output : _input;
-      build_opts.add_option("-DWIDTH=" +
-                            support::cpp11::to_string(input_for_width->info()->dimension(0)));
-
-      kernel_axis_name = "x";
-      lws_hint = create_lws_hint_parallel_implementations(input_for_width->info()->dimension(0),
-                                                          vector_size);
-    }
-    break;
-    case 1:
-      build_opts.add_option("-DHEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
-      kernel_axis_name = "y";
-      break;
-    case 2:
-      build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
-      kernel_axis_name = "z";
-      break;
-    case 3:
-      build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
-      build_opts.add_option("-DBATCH=" + support::cpp11::to_string(input->info()->dimension(3)));
-      kernel_axis_name = "w";
-      break;
-    default:
-      ARM_COMPUTE_ERROR("Not supported");
-  }
-  _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(
-      "arg_min_max_ex_" + kernel_axis_name, build_opts.options()));
-
-  // Configure kernel window
-  ICLKernel::configure_internal(std::get<1>(win_config), lws_hint);
-}
-
-Status CLArgMinMaxLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *prev_output,
-                                          const ITensorInfo *output, unsigned int axis,
-                                          ReductionOperation op)
-{
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, prev_output, output, axis, op));
-  ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(
-      input->clone().get(), (prev_output != nullptr) ? prev_output->clone().get() : nullptr,
-      output->clone().get(), axis, op)));
-  return Status{};
-}
-
-void CLArgMinMaxLayerKernelEx::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  switch (_reduction_axis)
-  {
-    case 0:
-    {
-      // Set out window
-      Window out_window(window);
-      out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
-
-      // Get first input and output slices
-      Window in_slice = window.first_slice_window_2D();
-      Window out_slice = out_window.first_slice_window_2D();
-
-      // Reshape window
-      const unsigned int num_tensors = _prev_output != nullptr ? 3 : 2;
-
-      // Set local sums buffer
-      unsigned int local_res_size = lws_hint()[0] * _output->info()->element_size();
-      _kernel.setArg(num_arguments_per_2D_tensor() * num_tensors, local_res_size, nullptr);
-      do
-      {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, in_slice);
-        if (_prev_output != nullptr)
-        {
-          add_2D_tensor_argument(idx, _prev_output, in_slice);
-        }
-        add_2D_tensor_argument(idx, _output, out_slice);
-        enqueue(queue, *this, in_slice, lws_hint());
-      } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
-    }
-    break;
-    case 1:
-    {
-      // Get first input and output slices
-      Window window_in{window};
-      window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1),
-                                                    _input->info()->dimension(1)));
-      Window in_slice = window_in.first_slice_window_2D();
-      Window out_slice = window.first_slice_window_2D();
-
-      do
-      {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, in_slice);
-        add_2D_tensor_argument(idx, _output, out_slice);
-        enqueue(queue, *this, in_slice, lws_hint());
-      } while (window_in.slide_window_slice_2D(in_slice) &&
-               window.slide_window_slice_2D(out_slice));
-    }
-    break;
-    case 2:
-    {
-      // Get first input and output slices
-      Window window_in{window};
-      window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2),
-                                                    _input->info()->dimension(2)));
-      Window in_slice = window_in.first_slice_window_3D();
-      Window out_slice = window.first_slice_window_3D();
-
-      do
-      {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, in_slice);
-        add_3D_tensor_argument(idx, _output, out_slice);
-        enqueue(queue, *this, in_slice, lws_hint());
-      } while (window_in.slide_window_slice_3D(in_slice) &&
-               window.slide_window_slice_3D(out_slice));
-    }
-    break;
-    case 3:
-    {
-      // Get first input and output slices
-      Window window_in{window};
-      window_in.set(3, Window::Dimension(0, 1, 1));
-      Window in_slice = window_in.first_slice_window_4D();
-      Window out_slice = window.first_slice_window_4D();
-
-      do
-      {
-        unsigned int idx = 0;
-        add_4D_tensor_argument(idx, _input, in_slice);
-        add_4D_tensor_argument(idx, _output, out_slice);
-        enqueue(queue, *this, in_slice, lws_hint());
-      } while (window_in.slide_window_slice_4D(in_slice) &&
-               window.slide_window_slice_4D(out_slice));
-    }
-    break;
-    default:
-      ARM_COMPUTE_ERROR("Not supported");
-  }
-}
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
deleted file mode 100644
index fbc76f5e1..000000000
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
+++ /dev/null
@@ -1,197 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate_parameters(const ITensorInfo *input1, const ITensorInfo *input2,
-                           const ITensorInfo *output)
-{
-  const TensorShape &out_shape =
-      TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
-
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
-                                  "Inputs are not broadcast compatible");
-  // Validate in case of configured output
-  if (output->total_size() > 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8,
-                                                         DataType::QASYMM8);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
-        "Wrong shape for output");
-  }
-  return Status{};
-}
-} // namespace
-
-CLBinaryLogicalOpKernel::CLBinaryLogicalOpKernel()
-    : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void CLBinaryLogicalOpKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
-                                        ICLTensor *output, BinaryLogicalOperation op)
-{
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_parameters(input1->info(), input2->info(), output->info()));
-
-  _input1 = input1;
-  _input2 = input2;
-  _output = output;
-
-  // Create kernel
-  std::string kernel_name = "binary_logical_op";
-  std::set<std::string> build_opts;
-  build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())));
-
-  int op_code = 0;
-  switch (op)
-  {
-    case BinaryLogicalOperation::AND:
-      op_code = 1;
-      break;
-    case BinaryLogicalOperation::OR:
-      op_code = 2;
-      break;
-    default:
-      throw std::runtime_error("Operation not supported, yet");
-  }
-
-  build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code)));
-  build_opts.emplace(
-      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
-  const std::pair<TensorShape, ValidRegion> broadcast_pair =
-      ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
-
-  const ValidRegion &valid_region = broadcast_pair.second;
-
-  Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
-  Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info());
-  Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info());
-
-  AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-  update_window_and_padding(win_input1, input1_access) ||
-      update_window_and_padding(win_input2, input2_access) ||
-      update_window_and_padding(win, output_access);
-
-  output_access.set_valid_region(win, valid_region);
-
-  ICLKernel::configure_internal(win);
-}
-
-void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-  const TensorShape &in_shape1 = _input1->info()->tensor_shape();
-  const TensorShape &in_shape2 = _input2->info()->tensor_shape();
-  const TensorShape &out_shape = _output->info()->tensor_shape();
-
-  bool can_collapse = true;
-  if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
-  {
-    can_collapse =
-        (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
-    for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
-    {
-      can_collapse = (in_shape1[d] == in_shape2[d]);
-    }
-  }
-
-  bool has_collapsed = false;
-  Window collapsed =
-      can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
-                   : window;
-
-  const TensorShape &in_shape1_collapsed =
-      has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
-  const TensorShape &in_shape2_collapsed =
-      has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
-
-  Window slice = collapsed.first_slice_window_3D();
-  Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
-  Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
-
-  do
-  {
-    unsigned int idx = 0;
-    add_3D_tensor_argument(idx, _input1, slice_input1);
-    add_3D_tensor_argument(idx, _input2, slice_input2);
-    add_3D_tensor_argument(idx, _output, slice);
-
-    enqueue(queue, *this, slice);
-
-    collapsed.slide_window_slice_3D(slice_input1);
-    collapsed.slide_window_slice_3D(slice_input2);
-  } while (collapsed.slide_window_slice_3D(slice));
-}
-
-BorderSize CLBinaryLogicalOpKernel::border_size() const
-{
-  const unsigned int replicateSize =
-      _output->info()->dimension(0) -
-      std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
-  const unsigned int border =
-      std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
-  return BorderSize(0, border, 0, 0);
-}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp
deleted file mode 100644
index 6e0bcde7f..000000000
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLCastBoolKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "support/StringSupport.h"
-
-#include <cstddef>
-#include <set>
-#include <string>
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(output);
-  ARM_COMPUTE_RETURN_ERROR_ON(input == output);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8,
-                                                       DataType::S16, DataType::U16, DataType::U32,
-                                                       DataType::S32, DataType::F16, DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == output->data_type(),
-                                  "Input and output data types must be different");
-
-  // Validate in case of configured output
-  if (output->total_size() > 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-  }
-
-  return Status{};
-}
-} // namespace
-
-void CLCastBoolKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-  // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype
-  // must be given)
-  set_shape_if_empty(*output->info(), input->info()->tensor_shape());
-
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
-
-  // Get number of elements to process per iterations
-  constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-  // Set build options
-  CLBuildOptions build_opts;
-  build_opts.add_option("-DVEC_SIZE=" +
-                        support::cpp11::to_string(num_elems_processed_per_iteration));
-  build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
-  build_opts.add_option("-DDATA_TYPE_OUT=" +
-                        get_cl_type_from_data_type(output->info()->data_type()));
-
-  // Create kernel
-  const std::string kernel_name = "cast_bool";
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options()));
-
-  // Configure kernel
-  ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration);
-
-  // Collapse window
-  const Window &full_window = window();
-  Window collapsed_window = full_window.collapse_if_possible(full_window, Window::DimZ);
-  ICLKernel::configure_internal(collapsed_window);
-
-  // Set config_id for enabling LWS tuning
-  _config_id = kernel_name;
-  _config_id += "_";
-  _config_id += lower_string(string_from_data_type(output->info()->data_type()));
-  _config_id += "_";
-  _config_id += support::cpp11::to_string(output->info()->dimension(0));
-  _config_id += "_";
-  _config_id += support::cpp11::to_string(output->info()->dimension(1));
-}
-
-Status CLCastBoolKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
-
-  return Status{};
-}
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
deleted file mode 100644
index 67aaf2db6..000000000
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
-  Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-  AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
-  bool window_changed = update_window_and_padding(win, input_access, output_access);
-  input_access.set_valid_region(win, output->valid_region());
-
-  Status err = (window_changed)
-                   ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
-                   : Status{};
-  return std::make_pair(err, win);
-}
-} // namespace
-
-CLEmbeddingLookupKernel::CLEmbeddingLookupKernel()
-    : _input(nullptr), _output(nullptr), _lookups(nullptr)
-{
-}
-
-Status CLEmbeddingLookupKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                         const ITensorInfo *lookups)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
-      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
-      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-  ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4);
-  ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1);
-
-  return Status{};
-}
-
-void CLEmbeddingLookupKernel::configure(const ICLTensor *input, ICLTensor *output,
-                                        const ICLTensor *lookups)
-{
-  ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info()));
-
-  _input = input;
-  _output = output;
-  _lookups = lookups;
-
-  // Set kernel build options
-  std::stringstream kernel_name;
-  std::set<std::string> build_opts;
-  kernel_name << "embedding_lookup";
-
-  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
-  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-  build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-  build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions()));
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts));
-
-  // Configure kernel window
-  auto win_config = validate_and_configure_window(input->info(), output->info());
-  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-  ICLKernel::configure_internal(win_config.second);
-}
-
-void CLEmbeddingLookupKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
-  Window win_lookup;
-  win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0));
-
-  do
-  {
-    unsigned int idx = 0;
-    add_4D_tensor_argument(idx, _input, slice_in);
-    add_4D_tensor_argument(idx, _output, slice_in);
-    add_1D_tensor_argument(idx, _lookups, win_lookup);
-
-    enqueue(queue, *this, slice_in);
-  } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_1D(win_lookup));
-}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
deleted file mode 100644
index 3bfe3e407..000000000
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
+++ /dev/null
@@ -1,162 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CL/kernels/CLGatherExKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
-#include "arm_compute/core/UtilsEx.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-
-namespace
-{
-
-inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *indices,
-                                 const ITensorInfo *output, int axis)
-{
-  const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions()));
-  ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 3);
-  ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
-  ARM_COMPUTE_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > 4);
-  ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= input->num_dimensions());
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
-      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
-      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
-
-  if (output->total_size() != 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
-    TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex(
-        input->tensor_shape(), indices->tensor_shape(), actual_axis);
-    ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
-  }
-
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32);
-
-  return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *indices,
-                                                        ITensorInfo *output, int axis)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
-  const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions()));
-  std::unique_ptr<ITensorInfo> output_info = input->clone();
-  output_info->set_tensor_shape(arm_compute::misc::shape_calculator::compute_gather_shape_ex(
-      input->tensor_shape(), indices->tensor_shape(), actual_axis));
-  // Output auto initialization if not yet initialized
-  auto_init_if_empty((*output), output_info->tensor_shape(), 1, input->data_type());
-
-  // Create window
-  Window win = calculate_max_window(*output, Steps());
-  output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
-
-  return std::make_pair(Status{}, win);
-}
-
-} // namespace
-
-CLGatherExKernel::CLGatherExKernel()
-    : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0)
-{
-}
-
-void CLGatherExKernel::configure(const ICLTensor *input, const ICLTensor *indices,
-                                 ICLTensor *output, int axis)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
-  ARM_COMPUTE_ERROR_THROW_ON(
-      validate_arguments(input->info(), indices->info(), output->info(), axis));
-
-  // Configure kernel window
-  auto win_config =
-      validate_and_configure_window(input->info(), indices->info(), output->info(), axis);
-  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-
-  _input = input;
-  _output = output;
-  _indices = indices;
-  _axis = wrap_around(axis, static_cast<int>(input->info()->num_dimensions()));
-
-  // Set build options
-  CLBuildOptions build_opts;
-  build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-  build_opts.add_option("-DOUTPUT_DIM_Z=" +
-                        support::cpp11::to_string(output->info()->dimension(2)));
-  build_opts.add_option("-DINPUT_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2)));
-  build_opts.add_option("-DAXIS=" + support::cpp11::to_string(_axis));
-  build_opts.add_option("-DINDICES_DIM=" +
-                        support::cpp11::to_string(indices->info()->num_dimensions()));
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("gather_ex", build_opts.options()));
-  ICLKernel::configure_internal(win_config.second);
-}
-
-Status CLGatherExKernel::validate(const ITensorInfo *input, const ITensorInfo *indices,
-                                  const ITensorInfo *output, int axis)
-{
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, indices, output, axis));
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
-                                                            indices->clone().get(),
-                                                            output->clone().get(), axis)
-                                  .first);
-  return Status{};
-}
-
-void CLGatherExKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ, 4);
-  unsigned int idx = 0;
-  add_4D_tensor_argument(idx, _input, window_collapsed);
-  add_3D_tensor_argument(idx, _indices, window_collapsed);
-  add_4D_tensor_argument(idx, _output, window_collapsed);
-  enqueue(queue, *this, window_collapsed, lws_hint());
-}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
deleted file mode 100644
index 930e7c944..000000000
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
+++ /dev/null
@@ -1,203 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
-  Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-  AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
-  bool window_changed = update_window_and_padding(win, input_access, output_access);
-  input_access.set_valid_region(win, output->valid_region());
-
-  Status err = (window_changed)
-                   ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
-                   : Status{};
-  return std::make_pair(err, win);
-}
-} // namespace
-
-CLHashtableLookupKernel::CLHashtableLookupKernel()
-{
-  // DO NOTHING
-}
-
-Status CLHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITensorInfo *keys,
-                                         const ITensorInfo *input, const ITensorInfo *output,
-                                         const ITensorInfo *hits)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
-      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
-      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
-                                  "Output's shape was not set");
-
-  ARM_COMPUTE_ERROR_ON(lookups->dimension(0) != hits->dimension(0) ||
-                       output->dimension(output->num_dimensions() - 1) != lookups->dimension(0));
-  ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4);
-  ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1);
-  ARM_COMPUTE_ERROR_ON(keys->num_dimensions() > 1);
-  ARM_COMPUTE_ERROR_ON(hits->num_dimensions() > 1);
-
-  return Status{};
-}
-
-void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTensor *keys,
-                                        const ICLTensor *input, ICLTensor *output, ICLTensor *hits)
-{
-  ARM_COMPUTE_ERROR_THROW_ON(
-      validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info()));
-
-  _lookups = lookups;
-  _keys = keys;
-  _input = input;
-  _output = output;
-  _hits = hits;
-
-  // Make _lookup_indices tensor
-  _lookup_indices = support::cpp14::make_unique<CLTensor>();
-  _lookup_indices->allocator()->init(
-      TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32));
-  _lookup_indices->allocator()->allocate();
-
-  // Set kernel build options
-  std::stringstream kernel_name;
-  std::set<std::string> build_opts;
-  kernel_name << "hashtable_lookup";
-
-  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
-  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-  build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-  build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions()));
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts));
-
-  // Configure kernel window
-  auto win_config = validate_and_configure_window(input->info(), output->info());
-  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-  ICLKernel::configure_internal(win_config.second);
-}
-
-void CLHashtableLookupKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  const_cast<ICLTensor *>(_lookups)->map(queue);
-  const_cast<ICLTensor *>(_keys)->map(queue);
-  _hits->map(queue);
-  _lookup_indices->map(queue);
-
-  // Set values of hits
-  const int32_t *lookups_buf =
-      reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_lookups)->buffer());
-  const int32_t *keys_buf = reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_keys)->buffer());
-  uint8_t *hits_buf = reinterpret_cast<uint8_t *>(_hits->buffer());
-  int32_t *lookup_indices_buf = reinterpret_cast<int32_t *>(_lookup_indices->buffer());
-
-  std::map<int32_t, size_t> key_map;
-  const size_t keys_num = _keys->info()->dimension(0);
-  for (size_t key_index = 0; key_index < keys_num; key_index++)
-  {
-    key_map[keys_buf[key_index]] = key_index;
-  }
-
-  const size_t lookups_num = _lookups->info()->dimension(0);
-  for (size_t i = 0; i < lookups_num; ++i)
-  {
-    const auto lookup_value = lookups_buf[i];
-    const auto it = key_map.find(lookup_value);
-    if (it != key_map.end())
-    {
-#if defined(ARM_COMPUTE_DEBUG_ENABLED)
-      if (it->second >= lookups_num)
-        ARM_COMPUTE_ERROR("HashTable Lookup: index out of bounds.");
-#endif // defined(ARM_COMPUTE_DEBUG_ENABLED)
-      lookup_indices_buf[i] = static_cast<int32_t>(it->second);
-      hits_buf[i] = static_cast<uint8_t>(1);
-    }
-    else
-    {
-      lookup_indices_buf[i] = -1;
-      hits_buf[i] = static_cast<uint8_t>(0);
-    }
-  }
-
-  const_cast<ICLTensor *>(_lookups)->unmap(queue);
-  const_cast<ICLTensor *>(_keys)->unmap(queue);
-  _hits->unmap(queue);
-  _lookup_indices->unmap(queue);
-
-  Window win = window.collapse(ICLKernel::window(), 2, 4);
-
-  Window win_lookup;
-  win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0));
-
-  do
-  {
-    unsigned int idx = 0;
-    add_4D_tensor_argument(idx, _input, win);
-    add_4D_tensor_argument(idx, _output, win);
-    add_1D_tensor_argument(idx, _lookup_indices.get(), win_lookup);
-
-    enqueue(queue, *this, win);
-  } while (window.slide_window_slice_4D(win) && window.slide_window_slice_1D(win_lookup));
-}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp
deleted file mode 100644
index 61c14d271..000000000
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp
+++ /dev/null
@@ -1,193 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
-#include "support/StringSupport.h"
-#include "support/ToolchainSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
-                          const ITensorInfo *gamma, const ITensorInfo *beta, float epsilon)
-{
-  ARM_COMPUTE_UNUSED(gamma);
-  ARM_COMPUTE_UNUSED(beta);
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(epsilon == 0.f, "Epsilon must be different than 0");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
-
-  if (output != nullptr && output->total_size() != 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(),
-                                    "Input and output have different number of channels");
-  }
-
-  return Status{};
-}
-
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
-  // We handle the planes manually
-  Window win = calculate_max_window(*input, Steps(1));
-
-  // Output auto initialization if not yet initialized
-  auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type());
-
-  // CLInstanceNormalizationLayerKernelEx doesn't need padding so update_window_and_padding() can be
-  // skipped
-  Coordinates coord;
-  coord.set_num_dimensions(output->num_dimensions());
-  output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
-  return std::make_pair(Status{}, win);
-}
-} // namespace
-
-CLInstanceNormalizationLayerKernelEx::CLInstanceNormalizationLayerKernelEx()
-    : _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon(1e-12),
-      _run_in_place(false)
-{
-}
-
-void CLInstanceNormalizationLayerKernelEx::configure(ICLTensor *input, ICLTensor *output,
-                                                     ICLTensor *gamma, ICLTensor *beta,
-                                                     float epsilon)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input);
-
-  _input = input;
-  _output = output == nullptr ? input : output;
-  _gamma = gamma;
-  _beta = beta;
-  _epsilon = epsilon;
-
-  _run_in_place = (output == nullptr) || (output == input);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(_input->info(), _output->info(),
-                                                gamma ? gamma->info() : nullptr,
-                                                beta ? beta->info() : nullptr, epsilon));
-  const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
-
-  CLBuildOptions build_opts;
-  build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-  build_opts.add_option("-DVEC_SIZE=" +
-                        support::cpp11::to_string(num_elems_processed_per_iteration));
-  build_opts.add_option("-DDIM_X=" + support::cpp11::to_string(input->info()->dimension(0)));
-  build_opts.add_option("-DDIM_Y=" + support::cpp11::to_string(input->info()->dimension(1)));
-  build_opts.add_option("-DDIM_Z=" + support::cpp11::to_string(input->info()->dimension(2)));
-  build_opts.add_option("-DEPSILON=" + float_to_string_with_full_precision(epsilon));
-  build_opts.add_option_if(gamma, "-DGAMMA");
-  build_opts.add_option_if(beta, "-DBETA");
-  build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
-  build_opts.add_option_if(_input->info()->data_layout() == DataLayout::NHWC, "-DNHWC");
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("instance_normalization_ex", build_opts.options()));
-
-  // Configure kernel window
-  auto win_config = validate_and_configure_window(_input->info(), _output->info());
-  ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-  ICLKernel::configure_internal(std::get<1>(win_config));
-}
-
-Status CLInstanceNormalizationLayerKernelEx::validate(const ITensorInfo *input,
-                                                      const ITensorInfo *output,
-                                                      const ITensorInfo *gamma,
-                                                      const ITensorInfo *beta, float epsilon)
-{
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, gamma, beta, epsilon));
-  ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(
-      input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get()))));
-  return Status{};
-}
-
-void CLInstanceNormalizationLayerKernelEx::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  Window collapsed_window = window.collapse(window, Window::DimZ);
-
-  // We will process the planes together
-  if (_input->info()->data_layout() == DataLayout::NCHW)
-  {
-    collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1));
-    collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1));
-  }
-  else
-  {
-    collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1));
-    collapsed_window.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(3), 1));
-  }
-
-  Window vec_window;
-  vec_window.set(Window::DimX, Window::Dimension(0, 0, 0));
-
-  unsigned int idx = 0;
-  add_4D_tensor_argument(idx, _input, collapsed_window);
-  if (!_run_in_place)
-  {
-    add_4D_tensor_argument(idx, _output, collapsed_window);
-  }
-  if (_gamma)
-  {
-    add_1D_tensor_argument(idx, _gamma, vec_window);
-  }
-  if (_beta)
-  {
-    add_1D_tensor_argument(idx, _beta, vec_window);
-  }
-
-  enqueue(queue, *this, collapsed_window, lws_hint());
-}
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp
deleted file mode 100644
index 6b27c9917..000000000
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *scale_factor,
-                          const ITensorInfo *output)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scale_factor, 1, DataType::F16,
-                                                       DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->tensor_shape().total_size() == 0);
-  ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->num_dimensions() > 1);
-  ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->dimension(0) != input->dimension(1));
-  ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-
-  // Checks performed when output is configured
-  if ((output->total_size() != 0))
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-  }
-
-  return Status{};
-}
-
-std::tuple<Status, Window> validate_and_configure_window(const ITensorInfo *input,
-                                                         ITensorInfo *output)
-{
-  // Configure kernel window
-  Window win = calculate_max_window(*input, Steps());
-
-  // Output tensor auto initialization if not yet initialized
-  auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32);
-
-  // CLMultiplyScaleFactorKernel doesn't need padding so update_window_and_padding() can be
-  // skipped
-  Coordinates coord;
-  coord.set_num_dimensions(output->num_dimensions());
-  output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
-
-  return std::make_tuple(Status{}, win);
-}
-} // namespace
-
-CLMultiplyScaleFactorKernel::CLMultiplyScaleFactorKernel()
-    : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f)
-{
-}
-
-void CLMultiplyScaleFactorKernel::configure(const ICLTensor *input, const ICLTensor *scale_factor,
-                                            ICLTensor *output, float multiplier)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_ERROR_THROW_ON(
-      validate_arguments(input->info(), scale_factor->info(), output->info()));
-
-  _input = input;
-  _scale_factor = scale_factor;
-  _output = output;
-  _multiplier = multiplier;
-
-  const int vec_size_x = 16 / output->info()->element_size();
-  const int output_width_x = output->info()->tensor_shape().x();
-  const bool multi_access_x = (output_width_x / vec_size_x > 0);
-
-  // Create and update the window (if needed)
-  Window win = calculate_max_window(*output->info());
-  if (multi_access_x)
-  {
-    win.set(Window::DimX,
-            Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x),
-                              vec_size_x));
-  }
-  ICLKernel::configure_internal(win);
-
-  // Create kernel
-  CLBuildOptions build_opts;
-  build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
-  build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
-  build_opts.add_option_if(
-      multi_access_x, "-DLAST_ACCESSED_X=" +
-                          support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
-
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("multiply_scale_factor", build_opts.options()));
-}
-
-Status CLMultiplyScaleFactorKernel::validate(const ITensorInfo *input,
-                                             const ITensorInfo *scale_factor,
-                                             const ITensorInfo *output)
-{
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, scale_factor, output));
-  ARM_COMPUTE_RETURN_ON_ERROR(
-      std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
-  return Status{};
-}
-
-void CLMultiplyScaleFactorKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-  Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-  Window slice = window_collapsed.first_slice_window_2D();
-
-  // Set scale_factor window
-  Window win_scale = calculate_max_window(*_scale_factor->info(), Steps());
-
-  do
-  {
-    unsigned int idx = 0;
-    add_2D_tensor_argument(idx, _input, slice);
-    add_1D_tensor_argument(idx, _scale_factor, win_scale);
-    add_2D_tensor_argument(idx, _output, slice);
-    _kernel.setArg<float>(idx++, _multiplier);
-    enqueue(queue, *this, slice, lws_hint());
-  } while (window_collapsed.slide_window_slice_2D(slice));
-}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
deleted file mode 100644
index 643c8b110..000000000
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CL/kernels/CLNegKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
-{
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16, DataType::S32,
-                                                DataType::F16, DataType::F32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16, DataType::S32,
-                                                DataType::F16, DataType::F32);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape());
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-  return Status{};
-}
-
-} // namespace
-
-CLNegKernel::CLNegKernel() : _input(nullptr), _output(nullptr) {}
-
-void CLNegKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
-
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
-
-  _input = input;
-  _output = output;
-
-  constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-  // Create kernel
-  std::set<std::string> build_opts;
-  build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-  build_opts.emplace(
-      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("neg_tensor", build_opts));
-
-  // Configure window
-  Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-
-  AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-  update_window_and_padding(win, input_access, output_access);
-  output_access.set_valid_region(win, input->info()->valid_region());
-
-  ICLKernel::configure_internal(win);
-}
-
-void CLNegKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-  Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-  Window slice = collapsed.first_slice_window_3D();
-
-  do
-  {
-    unsigned int idx = 0;
-    add_3D_tensor_argument(idx, _input, slice);
-    add_3D_tensor_argument(idx, _output, slice);
-    enqueue(queue, *this, slice, lws_hint());
-  } while (collapsed.slide_window_slice_3D(slice));
-}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp
deleted file mode 100644
index 35d70d689..000000000
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLOneHotKernel.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
-#include "support/StringSupport.h"
-#include <string>
-namespace arm_compute
-{
-namespace
-{
-// Checks the one-hot arguments for consistency.
-// Returns an error Status for the first failing check and an empty Status when all pass.
-// The data type / shape comparison against the output is only done once the output has a
-// non-zero total size.
-inline Status validate_arguments(const ITensorInfo *indices, const ITensorInfo *on_value,
-                                 const ITensorInfo *output, int depth, int axis)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(indices, on_value, output);
-  // NOTE(review): the axis is wrapped against the output rank; confirm wrap_around copes with
-  // an output whose shape has not been initialised yet.
-  const uint32_t axis_index = wrap_around(axis, static_cast<int>(output->num_dimensions()));
-  ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
-  ARM_COMPUTE_RETURN_ERROR_ON(on_value->tensor_shape().total_size() != 1);
-  ARM_COMPUTE_RETURN_ERROR_ON(depth <= 0);
-  ARM_COMPUTE_RETURN_ERROR_ON(axis_index >= output->num_dimensions());
-  ARM_COMPUTE_RETURN_ERROR_ON(on_value->data_type() == DataType::UNKNOWN);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(on_value, 1, DataType::U8, DataType::S8,
-                                                       DataType::U16, DataType::S16, DataType::F16,
-                                                       DataType::U32, DataType::S32, DataType::F32);
-  if (output->total_size() == 0)
-  {
-    // Output not sized yet: nothing further to compare against
-    return Status{};
-  }
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, output);
-  const TensorShape expected_shape = arm_compute::misc::shape_calculator::compute_onehot_shape_ex(
-      indices->tensor_shape(), static_cast<uint32_t>(depth), axis_index);
-  ARM_COMPUTE_RETURN_ERROR_ON(expected_shape.total_size() != output->tensor_shape().total_size());
-  return Status{};
-}
-
-// Auto-initialises the output info (if still empty) from the one-hot shape computed out of the
-// indices shape, depth and wrapped axis, then derives the execution window from the output.
-//
-// indices  - info of the indices tensor.
-// on_value - info of the on-value tensor; supplies the data type used for auto-initialisation.
-// output   - info of the output tensor; may be modified (shape/type init and valid region).
-// depth    - one-hot depth, passed on to the shape calculator.
-// axis     - axis, wrapped against the output rank before use.
-//
-// Returns a pair of (empty Status, window spanning the output).
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *indices,
-                                                        const ITensorInfo *on_value,
-                                                        ITensorInfo *output, int depth, int axis)
-{
-  // Each pointer is checked exactly once (the original listed `indices` twice)
-  ARM_COMPUTE_ERROR_ON_NULLPTR(indices, on_value, output);
-  const uint32_t actual_axis = wrap_around(axis, static_cast<int>(output->num_dimensions()));
-  // Output auto initialization if not yet initialized
-  TensorShape output_shape = arm_compute::misc::shape_calculator::compute_onehot_shape_ex(
-      indices->tensor_shape(), static_cast<uint32_t>(depth), actual_axis);
-  auto_init_if_empty((*output), output_shape, 1, on_value->data_type());
-  // Create window
-  Window win = calculate_max_window(*output, Steps());
-  output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
-  return std::make_pair(Status{}, win);
-}
-} // namespace
-CLOneHotKernel::CLOneHotKernel()
-    : _indices(nullptr), _on_value(nullptr), _off_value(nullptr), _output(nullptr),
-      _is_off_value_memset(false)
-{
-}
-// Configures the kernel with an explicit off-value tensor.
-//
-// indices   - indices tensor.
-// on_value  - tensor holding the on value.
-// off_value - tensor holding the off value; asserted to have exactly one element and the same
-//             data type as on_value.
-// output    - destination tensor.
-// depth     - one-hot depth, forwarded to configure_common().
-// axis      - axis, forwarded to configure_common().
-void CLOneHotKernel::configure(const ICLTensor *indices, const ICLTensor *on_value,
-                               const ICLTensor *off_value, ICLTensor *output, int depth, int axis)
-{
-  // Clearing the flag makes configure_common() pick the "one_hot" kernel and makes run() bind
-  // the off-value tensor as a kernel argument.
-  _is_off_value_memset = false;
-  ARM_COMPUTE_ERROR_ON_NULLPTR(indices, on_value, off_value, output);
-  ARM_COMPUTE_ERROR_ON_NULLPTR(off_value->info());
-  ARM_COMPUTE_ERROR_ON(off_value->info()->tensor_shape().total_size() != 1);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, off_value);
-  _off_value = off_value;
-  configure_common(indices, on_value, output, depth, axis);
-}
-// Configures the kernel without an off-value tensor.
-//
-// indices  - indices tensor.
-// on_value - tensor holding the on value.
-// output   - destination tensor.
-// depth    - one-hot depth, forwarded to configure_common().
-// axis     - axis, forwarded to configure_common().
-void CLOneHotKernel::configure(const ICLTensor *indices, const ICLTensor *on_value,
-                               ICLTensor *output, int depth, int axis)
-{
-  // Setting the flag makes configure_common() pick the "one_hot_only_on_value" kernel and a
-  // window computed from the indices; run() then skips the off-value argument.
-  // NOTE(review): presumably the caller fills the output with the off value beforehand -
-  // confirm against the calling function.
-  _is_off_value_memset = true;
-  ARM_COMPUTE_ERROR_ON_NULLPTR(indices, on_value, output);
-  configure_common(indices, on_value, output, depth, axis);
-}
-// Shared configuration path of both configure() overloads: validates the arguments, chooses
-// the execution window, stores the tensors and builds the OpenCL kernel.
-void CLOneHotKernel::configure_common(const ICLTensor *indices, const ICLTensor *on_value,
-                                      ICLTensor *output, int depth, int axis)
-{
-  ARM_COMPUTE_ERROR_THROW_ON(
-      validate_arguments(indices->info(), on_value->info(), output->info(), depth, axis));
-
-  // Kernel window; this call may also auto-initialise the output info
-  auto status_and_window =
-      validate_and_configure_window(indices->info(), on_value->info(), output->info(), depth, axis);
-  ARM_COMPUTE_ERROR_THROW_ON(status_and_window.first);
-
-  Window &kernel_window = status_and_window.second;
-  if (_is_off_value_memset)
-  {
-    // Without an off value the window is computed from the indices info instead
-    kernel_window = calculate_max_window(*indices->info(), Steps());
-  }
-
-  _indices = indices;
-  _on_value = on_value;
-  _output = output;
-
-  const auto wrapped_axis = wrap_around(axis, static_cast<int>(output->info()->num_dimensions()));
-
-  // Build options
-  const auto cl_data_type =
-      get_cl_unsigned_type_from_element_size(data_size_from_type(on_value->info()->data_type()));
-  CLBuildOptions options;
-  options.add_option("-DDATA_TYPE=" + cl_data_type);
-  options.add_option("-DAXIS=" + support::cpp11::to_string(wrapped_axis));
-  options.add_option("-DDEPTH=" + support::cpp11::to_string(depth));
-  options.add_option("-DOUTPUT_DIM_Z=" + support::cpp11::to_string(output->info()->dimension(2)));
-
-  // Kernel creation: the variant depends on whether an off-value tensor was supplied
-  std::string kernel_name;
-  if (_is_off_value_memset)
-  {
-    kernel_name = "one_hot_only_on_value";
-  }
-  else
-  {
-    kernel_name = "one_hot";
-  }
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, options.options()));
-  ICLKernel::configure_internal(kernel_window);
-}
-// Static validation for the variant that takes an off-value tensor.
-// Returns an error Status when off_value is null, does not hold exactly one element, differs
-// in data type from on_value, or when the common argument / window checks fail.
-Status CLOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *on_value,
-                                const ITensorInfo *off_value, const ITensorInfo *output, int depth,
-                                int axis)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(off_value);
-  ARM_COMPUTE_RETURN_ERROR_ON(off_value->tensor_shape().total_size() != 1);
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, off_value);
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(indices, on_value, output, depth, axis));
-
-  // Window configuration runs on clones so the caller's infos stay untouched
-  const auto indices_copy = indices->clone();
-  const auto on_value_copy = on_value->clone();
-  const auto output_copy = output->clone();
-  const auto window_result = validate_and_configure_window(
-      indices_copy.get(), on_value_copy.get(), output_copy.get(), depth, axis);
-  ARM_COMPUTE_RETURN_ON_ERROR(window_result.first);
-  return Status{};
-}
-// Static validation for the variant without an off-value tensor.
-// Returns an error Status when the common argument / window checks fail.
-Status CLOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *on_value,
-                                const ITensorInfo *output, int depth, int axis)
-{
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(indices, on_value, output, depth, axis));
-
-  // Window configuration runs on clones so the caller's infos stay untouched
-  const auto indices_copy = indices->clone();
-  const auto on_value_copy = on_value->clone();
-  const auto output_copy = output->clone();
-  const auto window_result = validate_and_configure_window(
-      indices_copy.get(), on_value_copy.get(), output_copy.get(), depth, axis);
-  ARM_COMPUTE_RETURN_ON_ERROR(window_result.first);
-  return Status{};
-}
-// Binds the tensors as kernel arguments and enqueues the kernel once over the (possibly
-// collapsed) window. The off-value tensor is only bound when one was configured.
-void CLOneHotKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  Window exec_window = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-
-  unsigned int arg_idx = 0;
-  add_3D_tensor_argument(arg_idx, _indices, exec_window);
-  add_1D_tensor_argument(arg_idx, _on_value, exec_window);
-  const bool has_off_value = !_is_off_value_memset;
-  if (has_off_value)
-  {
-    add_1D_tensor_argument(arg_idx, _off_value, exec_window);
-  }
-  add_4D_tensor_argument(arg_idx, _output, exec_window);
-
-  enqueue(queue, *this, exec_window, lws_hint());
-}
-
-
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp
deleted file mode 100644
index 1a7a18cfa..000000000
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp
+++ /dev/null
@@ -1,173 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-// Validates the tensor infos for symmetric quantization.
-//
-// input        - F32 or F16 info with at most 2 dimensions.
-// scale_factor - non-empty info with at most 1 dimension, same data type as input, and whose
-//                dimension 0 equals input's dimension 1.
-// output       - already initialised QASYMM8_SIGNED info with the same shape as input.
-//
-// Returns an error Status for the first failing check and an empty Status otherwise.
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *scale_factor,
-                          const ITensorInfo *output)
-{
-  // scale_factor is dereferenced below, so it is null-checked explicitly together with the
-  // other infos (configure() already checks all three tensors).
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, scale_factor, output);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
-  ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
-  ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, scale_factor);
-  ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->tensor_shape().total_size() == 0);
-  ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->num_dimensions() > 1);
-  ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->dimension(0) != input->dimension(1));
-
-  // Output must always be initialized
-  ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8_SIGNED);
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-
-  return Status{};
-}
-
-// Computes the execution window from the input info and marks the whole output as valid.
-// Returns a pair of (empty Status, window).
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
-  // Vector length: 16 divided by the input element size
-  const int elems_per_vec = 16 / input->element_size();
-  const int width_x = input->tensor_shape().x();
-
-  Window kernel_window = calculate_max_window(*input, Steps());
-
-  // Step along X by whole vectors only when the row holds at least one full vector
-  const bool vectorised_x = (width_x / elems_per_vec > 0);
-  if (vectorised_x)
-  {
-    kernel_window.set(Window::DimX,
-                      Window::Dimension(kernel_window.x().start(),
-                                        ceil_to_multiple(kernel_window.x().end(), elems_per_vec),
-                                        elems_per_vec));
-  }
-
-  // Valid region starts at the origin and spans the full output shape
-  Coordinates origin;
-  origin.set_num_dimensions(output->num_dimensions());
-  output->set_valid_region(ValidRegion(origin, output->tensor_shape()));
-
-  return std::make_pair(Status{}, kernel_window);
-}
-} // namespace
-
-CLQuantizationSymmetricKernel::CLQuantizationSymmetricKernel()
-    : _input(nullptr), _scale_factor(nullptr), _output(nullptr)
-{
-}
-
-// Validates the tensors, sets up the execution window and builds the "quantization_symm8"
-// OpenCL kernel.
-//
-// input        - source tensor.
-// scale_factor - tensor of scale factors.
-// output       - destination tensor.
-void CLQuantizationSymmetricKernel::configure(const ICLTensor *input, const ICLTensor *scale_factor,
-                                              ICLTensor *output)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, scale_factor, output);
-  ARM_COMPUTE_ERROR_THROW_ON(
-      validate_arguments(input->info(), scale_factor->info(), output->info()));
-
-  _input = input;
-  _scale_factor = scale_factor;
-  _output = output;
-
-  // Same vector-length computation as in validate_and_configure_window()
-  const int elems_per_vec = 16 / input->info()->element_size();
-  const int width_x = input->info()->tensor_shape().x();
-  const bool vectorised_x = (width_x / elems_per_vec > 0);
-
-  // Execution window
-  auto status_and_window = validate_and_configure_window(input->info(), output->info());
-  ARM_COMPUTE_ERROR_THROW_ON(status_and_window.first);
-  ICLKernel::configure_internal(status_and_window.second);
-
-  // Build options
-  CLBuildOptions options;
-  options.add_option("-DVEC_SIZE=" + support::cpp11::to_string(elems_per_vec));
-  options.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
-  options.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
-  // LAST_ACCESSED_X is only defined when the window is stepped by whole vectors
-  const int last_accessed_x = std::max<int>(width_x - elems_per_vec, 0);
-  options.add_option_if(vectorised_x,
-                        "-DLAST_ACCESSED_X=" + support::cpp11::to_string(last_accessed_x));
-
-  // Kernel creation
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("quantization_symm8", options.options()));
-}
-
-// Static validation: runs the argument checks, then the window configuration on cloned infos.
-// Returns the first error Status encountered, or an empty Status.
-Status CLQuantizationSymmetricKernel::validate(const ITensorInfo *input,
-                                               const ITensorInfo *scale_factor,
-                                               const ITensorInfo *output)
-{
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, scale_factor, output));
-
-  // Clones keep the caller's infos untouched
-  const auto input_copy = input->clone();
-  const auto output_copy = output->clone();
-  const auto window_result = validate_and_configure_window(input_copy.get(), output_copy.get());
-  ARM_COMPUTE_RETURN_ON_ERROR(window_result.first);
-
-  return Status{};
-}
-
-// Enqueues the kernel once per 2D slice of the (possibly collapsed) execution window.
-void CLQuantizationSymmetricKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-  // Support only 2D
-  Window merged = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-  Window slice_2d = merged.first_slice_window_2D();
-
-  bool more_slices = true;
-  while (more_slices)
-  {
-    // The scale-factor tensor is addressed with the slice shifted by one dimension
-    Window scale_window = slice_2d.shift_dimensions(1);
-
-    unsigned int arg_idx = 0;
-    add_2D_tensor_argument(arg_idx, _input, slice_2d);
-    add_1D_tensor_argument(arg_idx, _scale_factor, scale_window);
-    add_2D_tensor_argument(arg_idx, _output, slice_2d);
-    enqueue(queue, *this, slice_2d, lws_hint());
-
-    more_slices = merged.slide_window_slice_2D(slice_2d);
-  }
-}
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
deleted file mode 100644
index 06c2579f2..000000000
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
+++ /dev/null
@@ -1,204 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-namespace
-{
-// NOTE This is necessary because it is not guaranteed that the axis positions of input and output
-// are the same.
-// Returns a copy of input_shape in which the dimension at `axis` has been set to 1.
-const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis)
-{
-  TensorShape reduced_shape(input_shape);
-  reduced_shape.set(axis, 1);
-  return reduced_shape;
-}
-} // namespace
-
-namespace
-{
-// Validates the tensor infos and parameters of a reduce operation.
-//
-// input  - QASYMM8, F16, F32 or S32 info (QASYMM8 is rejected for SUM).
-// output - initialised info with the same data type as input and whose total size equals
-//          that of the input shape with `axis` reduced to 1.
-// axis   - axis to reduce; must be smaller than the input rank.
-// op     - reduce operation.
-//
-// Returns an error Status for the first failing check and an empty Status otherwise.
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis,
-                          ReduceOperation op)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-
-  if (output->total_size() != 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-  }
-
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
-                                                       DataType::F32, DataType::S32);
-  if (op == ReduceOperation::SUM)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8,
-                                    "Not support QASYMM8, yet");
-  }
-  // A validator reports failures through its returned Status, so the RETURN form is used here
-  // like in every other check, instead of the asserting ARM_COMPUTE_ERROR_ON_* form.
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
-                                  "Inputs are not broadcast compatible");
-
-  const auto num_dimensions = input->tensor_shape().num_dimensions();
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= num_dimensions, "axis must be less than (input's rank).");
-
-  const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis);
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(),
-                                  "output shape's size does not match axis");
-
-  return Status{};
-}
-} // namespace
-
-CLReduceOperationKernel::CLReduceOperationKernel() : _input(nullptr), _output(nullptr), _axis() {}
-
-// Validates the arguments, selects and builds the OpenCL kernel for `op`, and configures the
-// execution window from the reduced output shape.
-//
-// input  - source tensor.
-// output - destination tensor.
-// axis   - axis to reduce.
-// op     - reduce operation; MAX/MIN use "reduce_min_max", SUM/MEAN use "reduce_sum_mean".
-//
-// Throws std::runtime_error for any other operation.
-void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *output,
-                                        const uint32_t axis, ReduceOperation op)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
-
-  _input = input;
-  _output = output;
-  _axis = axis;
-
-  // Work on a clone of the output info that carries the shape inferred from the input
-  auto reduced_info = output->info()->clone();
-  reduced_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis));
-
-  // Kernel name and the OP_CODE forwarded to it
-  std::string kernel_name;
-  int op_code = 0;
-  switch (op)
-  {
-    case ReduceOperation::MAX:
-      kernel_name = "reduce_min_max";
-      op_code = 1;
-      break;
-    case ReduceOperation::MIN:
-      kernel_name = "reduce_min_max";
-      op_code = 2;
-      break;
-    case ReduceOperation::SUM:
-      kernel_name = "reduce_sum_mean";
-      op_code = 3;
-      break;
-    case ReduceOperation::MEAN:
-      kernel_name = "reduce_sum_mean";
-      op_code = 4;
-      break;
-    default:
-      throw std::runtime_error("Operation not supported, yet");
-  }
-
-  // Kernel build options
-  const std::set<std::string> build_opts{
-      "-DDATA_TYPE=" + get_cl_type_from_data_type(reduced_info->data_type()),
-      "-DDEPTH_OUT=" + support::cpp11::to_string(reduced_info->dimension(2)),
-      "-DOP_CODE=" + support::cpp11::to_string(op_code)};
-
-  // Kernel creation
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
-  // Execution window spans the reduced shape
-  Window kernel_window = calculate_max_window(*reduced_info, Steps());
-
-  Coordinates origin;
-  origin.set_num_dimensions(reduced_info->num_dimensions());
-  output->info()->set_valid_region(ValidRegion(origin, reduced_info->tensor_shape()));
-
-  ICLKernel::configure_internal(kernel_window);
-}
-
-// Static validation of a reduce operation configuration.
-//
-// input  - info of the source tensor.
-// output - info of the destination tensor.
-// axis   - axis to reduce.
-// op     - reduce operation.
-//
-// Returns an error Status when an info is null or validate_arguments() rejects the
-// configuration, and an empty Status otherwise.
-Status CLReduceOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                         const uint32_t axis, ReduceOperation op)
-{
-  // Null infos are reported through the returned Status rather than through the asserting
-  // ARM_COMPUTE_ERROR_ON_* form, so callers can probe support without hitting an assertion.
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
-
-  return Status{};
-}
-
-// Binds the kernel arguments and enqueues the reduce kernel once over the collapsed window.
-// The output tensor's shape is temporarily replaced by the shape inferred from the input and
-// restored before returning.
-void CLReduceOperationKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-  const TensorShape &shape_in = _input->info()->tensor_shape();
-
-  unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
-
-  // Trailing scalar arguments: the reduced axis and the input extent along that axis
-  _kernel.setArg<cl_int>(idx++, _axis);
-  _kernel.setArg<cl_int>(idx++, shape_in[_axis]);
-
-  // Support dimensions up to 4
-  Window slice_out = window.collapse(ICLKernel::window(), 2, 4);
-
-  // Setup input slice
-  // The input window starts as a copy of the output window with its first four dimensions
-  // replaced by (0, 0, 0) dimensions.
-  Window slice_in(slice_out);
-  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
-  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-  slice_in.set(3, Window::Dimension(0, 0, 0));
-
-  // Copy output's shape in order to use for recovering at end of this method
-  // TODO Remove changing and recovering output's shape if it is guaranteed that the axis positions
-  // of input and output are the same
-  const TensorShape shape_out = _output->info()->tensor_shape();
-  _output->info()->set_tensor_shape(inferOutputShape(shape_in, _axis));
-
-  // Tensor arguments occupy the leading slots that were skipped above
-  idx = 0;
-  add_4D_tensor_argument(idx, _input, slice_in);
-  add_4D_tensor_argument(idx, _output, slice_out);
-  enqueue(queue, *this, slice_out, lws_hint());
-
-  // Recover output's shape of output tensor
-  _output->info()->set_tensor_shape(shape_out);
-}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp
deleted file mode 100644
index 8d8853c81..000000000
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "support/StringSupport.h"
-
-#include <climits>
-
-using namespace arm_compute;
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-// Validates the tensor infos for the scale-factor kernel.
-//
-// input  - F32 info with at most 2 dimensions.
-// output - if already sized, must share input's data type and have the 1D shape
-//          {input->dimension(1)}.
-//
-// Returns an error Status for the first failing check and an empty Status otherwise.
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
-
-  const bool output_is_sized = output->tensor_shape().total_size() > 0;
-  if (!output_is_sized)
-  {
-    // Nothing to compare against until the output is initialised
-    return Status{};
-  }
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-  TensorShape expected_shape = TensorShape{input->dimension(1)};
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), expected_shape);
-
-  return Status{};
-}
-
-// Auto-initialises the output info (if empty) to the 1D shape {input->dimension(1)} with the
-// input's data type, then configures the execution window and access patterns.
-//
-// Returns a tuple of (Status, window); the Status carries "Insufficient Padding!" when
-// update_window_and_padding() reports that the window changed.
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
-  // Output auto initialization if not yet initialized
-  TensorShape scale_shape = TensorShape{input->dimension(1)};
-  auto_init_if_empty(*output, scale_shape, 1, input->data_type());
-
-  const unsigned int elems_per_iteration = 1;
-
-  // Execution window over the input
-  Window kernel_window = calculate_max_window(*input, Steps(elems_per_iteration));
-  AccessWindowHorizontal in_access(input, 0, elems_per_iteration);
-  AccessWindowStatic out_access(output, 0, 0, output->dimension(0), 1);
-
-  const bool window_changed = update_window_and_padding(kernel_window, in_access, out_access);
-
-  out_access.set_valid_region(kernel_window, ValidRegion(Coordinates(), output->tensor_shape()));
-
-  Status status{};
-  if (window_changed)
-  {
-    status = ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!");
-  }
-  return std::make_tuple(status, kernel_window);
-}
-} // namespace
-
-CLScaleFactorSymm8Kernel::CLScaleFactorSymm8Kernel() : _input(nullptr), _output(nullptr) {}
-
-// Validates the tensors, builds the "scale_factor_symm8" OpenCL kernel and configures the
-// execution window.
-//
-// input  - source tensor; its dimension 0 is forwarded to the kernel as WIDTH.
-// output - destination tensor for the scale factors.
-void CLScaleFactorSymm8Kernel::configure(const ICLTensor *input, ICLTensor *output)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
-
-  _input = input;
-  _output = output;
-
-  // Build options and kernel creation
-  const std::set<std::string> build_opts{
-      "-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0))};
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("scale_factor_symm8", build_opts));
-
-  // Execution window; this call may also auto-initialise the output info
-  auto status_and_window = validate_and_configure_window(input->info(), output->info());
-
-  ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(status_and_window));
-
-  ICLKernel::configure_internal(std::get<1>(status_and_window));
-}
-
-// Static validation: runs the argument checks, then the window configuration on cloned infos.
-// Returns the first error Status encountered, or an empty Status.
-Status CLScaleFactorSymm8Kernel::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
-
-  // Clones keep the caller's infos untouched
-  const auto input_copy = input->clone();
-  const auto output_copy = output->clone();
-  const auto window_result = validate_and_configure_window(input_copy.get(), output_copy.get());
-  ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(window_result));
-
-  return Status{};
-}
-
-// Enqueues the kernel once per 2D slice of the (possibly collapsed) execution window, with
-// the X dimension of each slice fixed to (0, 1, 1).
-void CLScaleFactorSymm8Kernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  Window merged = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-  Window slice_2d = merged.first_slice_window_2D();
-  slice_2d.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-  bool more_slices = true;
-  while (more_slices)
-  {
-    // The output tensor is addressed with the slice shifted by one dimension
-    Window out_window = slice_2d.shift_dimensions(1);
-
-    unsigned int arg_idx = 0;
-    // Set inputs
-    add_2D_tensor_argument(arg_idx, _input, slice_2d);
-    add_1D_tensor_argument(arg_idx, _output, out_window);
-    enqueue(queue, *this, slice_2d, lws_hint());
-
-    more_slices = merged.slide_window_slice_2D(slice_2d);
-  }
-}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp
deleted file mode 100644
index 151d45e8d..000000000
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp
+++ /dev/null
@@ -1,497 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-// Disable GPU implementation
-// TODO Enable GPU implementation with verification, or remove code
-//      Invalid result on GPU
-#if 0
-namespace arm_compute
-{
-////////////////////////////////////////////////////////////////////////////////
-CLTopKV2Single::CLTopKV2Single() : _input(nullptr), _topk_values(nullptr), _topk_indices(nullptr) {}
-
-void CLTopKV2Single::configure(ICLTensor *input, ICLTensor *topk_values, ICLTensor *topk_indices,
-                               cl::Buffer *indices, cl::Buffer *temp_stack, int k, int n)
-{
-  ARM_COMPUTE_ERROR_ON(input == nullptr && indices == nullptr);
-  ARM_COMPUTE_ERROR_ON(topk_values == nullptr && topk_indices == nullptr);
-  ARM_COMPUTE_ERROR_ON(n == 0);
-
-  _input = input;
-  _topk_values = topk_values;
-  _topk_indices = topk_indices;
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("topkv2_quicksort", build_opts));
-
-  unsigned int idx = 3 * num_arguments_per_1D_tensor();
-  _kernel.setArg(idx++, *indices);
-  _kernel.setArg(idx++, *temp_stack);
-  _kernel.setArg<cl_int>(idx++, k);
-  _kernel.setArg<cl_int>(idx++, n);
-
-  // Configure kernel window
-  Window win;
-  win.set(0, Window::Dimension(0, 1, 1));
-  ICLKernel::configure_internal(win);
-}
-
-void CLTopKV2Single::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  unsigned int idx = 0;
-  add_1D_tensor_argument(idx, _input, window);
-  add_1D_tensor_argument(idx, _topk_values, window);
-  add_1D_tensor_argument(idx, _topk_indices, window);
-
-  enqueue(queue, *this, window);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-CLTopKV2Init::CLTopKV2Init() : _input(nullptr) {}
-
-void CLTopKV2Init::configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffer *in_ind_buf,
-                             int n)
-{
-  ARM_COMPUTE_ERROR_ON(input == nullptr && in_key_buf == nullptr);
-  ARM_COMPUTE_ERROR_ON(in_ind_buf == nullptr);
-  ARM_COMPUTE_ERROR_ON(n == 0);
-
-  _input = input;
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-
-  // Create kernel
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("topkv2_init", build_opts));
-
-  unsigned int idx = num_arguments_per_1D_tensor();
-  _kernel.setArg(idx++, *in_key_buf);
-  _kernel.setArg(idx++, *in_ind_buf);
-  _kernel.setArg<cl_int>(idx++, n);
-
-  // Configure kernel window
-  Window win;
-  win.set(0, Window::Dimension(0, n, 1));
-  ICLKernel::configure_internal(win);
-}
-
-void CLTopKV2Init::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  unsigned int idx = 0;
-  add_1D_tensor_argument(idx, _input, window);
-
-  enqueue(queue, *this, window);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// This kernel makes a histogram of radix for each work item.
-CLRadixSortHistogram::CLRadixSortHistogram() : _pass(0), _in_key_buf(nullptr) {}
-
-void CLRadixSortHistogram::configure(cl::Buffer *hist_buf, int bits, int n)
-{
-  ARM_COMPUTE_ERROR_ON(hist_buf == nullptr);
-
-  unsigned int radix = 1 << bits;
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
-  build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
-  build_opts.emplace("-DPERMUT=1");
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("radixsort_histogram", build_opts));
-
-  int loc_histo_size = radix * _ITEMS * sizeof(cl_int);
-
-  unsigned int idx = 1;
-  _kernel.setArg(idx++, *hist_buf);
-
-  idx = 3;
-  _kernel.setArg(idx++, loc_histo_size, nullptr);
-  _kernel.setArg<cl_int>(idx++, n);
-
-  // Configure kernel window
-  Window win;
-  win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1));
-  ICLKernel::configure_internal(win);
-}
-
-void CLRadixSortHistogram::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  _kernel.setArg(0, *_in_key_buf);
-  _kernel.setArg<cl_int>(2, _pass);
-
-  cl::NDRange lws = cl::NDRange(_ITEMS, 1);
-
-  enqueue(queue, *this, window, lws);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-CLRadixSortScanHistogram::CLRadixSortScanHistogram() {}
-
-void CLRadixSortScanHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits)
-{
-  ARM_COMPUTE_ERROR_ON(hist_buf == nullptr && glob_sum_buf == nullptr);
-
-  unsigned int radix = 1 << bits;
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
-  build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
-  build_opts.emplace("-DPERMUT=1");
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("radixsort_scanhistograms", build_opts));
-
-  int temp_size =
-      std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint);
-
-  unsigned int idx = 0;
-  _kernel.setArg(idx++, *hist_buf);
-  _kernel.setArg(idx++, temp_size, nullptr);
-  _kernel.setArg(idx++, *glob_sum_buf);
-
-  // Configure kernel window
-  Window win;
-  win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1));
-  ICLKernel::configure_internal(win);
-}
-
-void CLRadixSortScanHistogram::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
-  cl::NDRange lws = cl::NDRange(gws_x / _HISTOSPLIT, 1);
-
-  enqueue(queue, *this, window, lws);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-CLRadixSortGlobalScanHistogram::CLRadixSortGlobalScanHistogram() {}
-
-void CLRadixSortGlobalScanHistogram::configure(cl::Buffer *glob_sum_buf, cl::Buffer *temp_buf,
-                                               int bits)
-{
-  ARM_COMPUTE_ERROR_ON(glob_sum_buf == nullptr && temp_buf == nullptr);
-
-  unsigned int radix = 1 << bits;
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
-  build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
-  build_opts.emplace("-DPERMUT=1");
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("radixsort_scanhistograms", build_opts));
-
-  int temp_size =
-      std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint);
-
-  unsigned int idx = 0;
-  _kernel.setArg(idx++, *glob_sum_buf);
-  _kernel.setArg(idx++, temp_size, nullptr);
-  _kernel.setArg(idx++, *temp_buf);
-
-  // Configure kernel window
-  Window win;
-  win.set(0, Window::Dimension(0, _HISTOSPLIT / 2, 1));
-  ICLKernel::configure_internal(win);
-}
-
-void CLRadixSortGlobalScanHistogram::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
-  cl::NDRange lws = cl::NDRange(gws_x, 1);
-
-  enqueue(queue, *this, window, lws);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-CLRadixSortPasteHistogram::CLRadixSortPasteHistogram() {}
-
-void CLRadixSortPasteHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits)
-{
-  ARM_COMPUTE_ERROR_ON(hist_buf == nullptr && glob_sum_buf == nullptr);
-
-  unsigned int radix = 1 << bits;
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
-  build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
-  build_opts.emplace("-DPERMUT=1");
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("radixsort_pastehistograms", build_opts));
-
-  unsigned int idx = 0;
-  _kernel.setArg(idx++, *hist_buf);
-  _kernel.setArg(idx++, *glob_sum_buf);
-
-  // Configure kernel window
-  Window win;
-  win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1));
-  ICLKernel::configure_internal(win);
-}
-
-void CLRadixSortPasteHistogram::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
-  cl::NDRange lws = cl::NDRange(gws_x / _HISTOSPLIT, 1);
-
-  enqueue(queue, *this, window, lws);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-CLRadixSortReorder::CLRadixSortReorder()
-    : _pass(0), _in_key_buf(nullptr), _out_key_buf(nullptr), _in_ind_buf(nullptr),
-      _out_ind_buf(nullptr)
-{
-}
-
-void CLRadixSortReorder::configure(cl::Buffer *hist_buf, int bits, int n)
-{
-  ARM_COMPUTE_ERROR_ON(hist_buf == nullptr);
-  ARM_COMPUTE_ERROR_ON(n == 0);
-
-  unsigned int radix = 1 << bits;
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
-  build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
-  build_opts.emplace("-DPERMUT=1");
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("radixsort_reorder", build_opts));
-
-  unsigned int idx = 2;
-  _kernel.setArg(idx++, *hist_buf);
-
-  idx = 6;
-  _kernel.setArg(idx++, sizeof(uint) * radix * _ITEMS, nullptr);
-  _kernel.setArg<cl_int>(idx++, n);
-
-  // Configure kernel window
-  Window win;
-  win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1));
-  ICLKernel::configure_internal(win);
-}
-
-void CLRadixSortReorder::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
-  unsigned int lx = std::max(1U, (gws_x / _HISTOSPLIT));
-  cl::NDRange lws = (lx < gws_x) ? cl::NDRange(lx, 1) : cl::NDRange(1, 1);
-
-  _kernel.setArg(0, *_in_key_buf);
-  _kernel.setArg(1, *_out_key_buf);
-  _kernel.setArg<cl_int>(3, _pass);
-  _kernel.setArg(4, *_in_ind_buf);
-  _kernel.setArg(5, *_out_ind_buf);
-
-  enqueue(queue, *this, window, lws);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-CLTopKV2FindFirstNegative::CLTopKV2FindFirstNegative() : _out_key_buf(nullptr) {}
-
-void CLTopKV2FindFirstNegative::configure(cl::Buffer *first_negative_idx_buf, int n)
-{
-  ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr);
-  ARM_COMPUTE_ERROR_ON(n == 0);
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("topkv2_find_first_negative", build_opts));
-
-  unsigned int idx = 1;
-  _kernel.setArg(idx++, *first_negative_idx_buf);
-  _kernel.setArg<cl_int>(idx++, n);
-
-  // Configure kernel window
-  Window win;
-  win.set(0, Window::Dimension(0, n, 1));
-  ICLKernel::configure_internal(win);
-}
-
-void CLTopKV2FindFirstNegative::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  unsigned int idx = 0;
-  _kernel.setArg(idx++, *_out_key_buf);
-
-  enqueue(queue, *this, window);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-CLTopKV2ReorderNegatives::CLTopKV2ReorderNegatives()
-    : _in_key_buf(nullptr), _out_key_buf(nullptr), _in_ind_buf(nullptr), _out_ind_buf(nullptr)
-{
-}
-
-void CLTopKV2ReorderNegatives::configure(cl::Buffer *first_negative_idx_buf, int n)
-{
-  ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr);
-  ARM_COMPUTE_ERROR_ON(n == 0);
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("topkv2_reorder_negatives", build_opts));
-
-  unsigned int idx = 4;
-  _kernel.setArg(idx++, *first_negative_idx_buf);
-  _kernel.setArg<cl_int>(idx++, n);
-
-  // Configure kernel window
-  Window win;
-  win.set(0, Window::Dimension(0, n, 1));
-  ICLKernel::configure_internal(win);
-}
-
-void CLTopKV2ReorderNegatives::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  unsigned int idx = 0;
-  _kernel.setArg(idx++, *_in_key_buf);
-  _kernel.setArg(idx++, *_out_key_buf);
-  _kernel.setArg(idx++, *_in_ind_buf);
-  _kernel.setArg(idx++, *_out_ind_buf);
-
-  enqueue(queue, *this, window);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-CLTopKV2Store::CLTopKV2Store()
-    : _values(nullptr), _indices(nullptr), _out_key_buf(nullptr), _out_ind_buf(nullptr)
-{
-}
-
-void CLTopKV2Store::configure(ICLTensor *values, ICLTensor *indices, int k, int n)
-{
-  ARM_COMPUTE_ERROR_ON(values == nullptr && indices == nullptr);
-  ARM_COMPUTE_ERROR_ON(k == 0);
-  ARM_COMPUTE_ERROR_ON(k > n);
-
-  _values = values;
-  _indices = indices;
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-
-  // Create kernel
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("topkv2_store", build_opts));
-
-  unsigned int idx = 2 * num_arguments_per_1D_tensor() + 2;
-  _kernel.setArg<cl_int>(idx++, n);
-
-  // Configure kernel window
-  Window win;
-  win.set(0, Window::Dimension(0, k, 1));
-  ICLKernel::configure_internal(win);
-}
-
-void CLTopKV2Store::setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf)
-{
-  _out_key_buf = out_key_buf;
-  _out_ind_buf = out_ind_buf;
-}
-
-void CLTopKV2Store::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  unsigned int idx = 0;
-  add_1D_tensor_argument(idx, _values, window);
-  add_1D_tensor_argument(idx, _indices, window);
-  _kernel.setArg(idx++, *_out_key_buf);
-  _kernel.setArg(idx++, *_out_ind_buf);
-
-  enqueue(queue, *this, window);
-}
-
-} // namespace arm_compute
-#endif // Disable GPU implementation
diff --git a/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp
deleted file mode 100644
index dfe5d59b0..000000000
--- a/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp
+++ /dev/null
@@ -1,167 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h"
-
-#include <algorithm>
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Window.h"
-
-namespace
-{
-
-using namespace arm_compute;
-template <typename InputScalarType, typename OutputScalarType, typename InputVectorType>
-void elementwise_op_templ(
-    const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
-    OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &),
-    int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &,
-                          OutputScalarType *, const bool),
-    int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *,
-                     OutputScalarType *))
-{
-  // Create input windows
-  Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
-  Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
-
-  // Clear X Dimension on execution window as we handle manually
-  Window win = window;
-  win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-  const int window_step_x = std::min(16 / static_cast<int>(sizeof(OutputScalarType)), 8);
-  const auto window_start_x = static_cast<int>(window.x().start());
-  const auto window_end_x = static_cast<int>(window.x().end());
-  const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);
-
-  if (is_broadcast_across_x)
-  {
-    const bool is_broadcast_input_2 = input2_win.x().step() == 0;
-    Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
-    Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
-    const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
-    const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
-
-    // Clear X Dimension on execution window as we handle manually
-    non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator broadcast_input(broadcast_tensor, broadcast_win);
-    Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
-    Iterator output(out, win);
-
-    execute_window_loop(win,
-                        [&](const Coordinates &) {
-                          auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
-                          const auto non_broadcast_input_ptr =
-                              reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
-                          const InputScalarType broadcast_value =
-                              *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
-
-                          int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x,
-                                                    non_broadcast_input_ptr, broadcast_value,
-                                                    output_ptr, !is_broadcast_input_2);
-                          for (; x < window_end_x; ++x)
-                          {
-                            const auto a = *(non_broadcast_input_ptr + x);
-                            *(output_ptr + x) =
-                                (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a,
-                                               !is_broadcast_input_2 ? a : broadcast_value);
-                          }
-                        },
-                        broadcast_input, non_broadcast_input, output);
-  }
-  else
-  {
-    // Clear X Dimension on execution window as we handle manually
-    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator input1(in1, input1_win);
-    Iterator input2(in2, input2_win);
-    Iterator output(out, win);
-
-    execute_window_loop(win,
-                        [&](const Coordinates &) {
-                          auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
-                          const auto input1_ptr =
-                              reinterpret_cast<const InputScalarType *>(input1.ptr());
-                          const auto input2_ptr =
-                              reinterpret_cast<const InputScalarType *>(input2.ptr());
-
-                          int x = (*neon_func)(window_start_x, window_end_x, window_step_x,
-                                               input1_ptr, input2_ptr, output_ptr);
-                          for (; x < window_end_x; ++x)
-                          {
-                            const auto a = *(input1_ptr + x);
-                            const auto b = *(input2_ptr + x);
-                            *(output_ptr + x) = (*scalar_func)(a, b);
-                          }
-                        },
-                        input1, input2, output);
-  }
-}
-
-} // namespace
-
-namespace arm_compute
-{
-
-void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
-                    float (*scalar_func)(const float &, const float &),
-                    int (*broadcast_func)(int, int, int, const float *, const float &, float *,
-                                          const bool),
-                    int (*neon_func)(int, int, int, const float *, const float *, float *))
-{
-  elementwise_op_templ<float, float, float32x4_t>(in1, in2, out, window, scalar_func,
-                                                  broadcast_func, neon_func);
-}
-
-void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
-                    uint8_t (*scalar_func)(const uint8_t &, const uint8_t &),
-                    int (*broadcast_func)(int, int, int, const uint8_t *, const uint8_t &,
-                                          uint8_t *, const bool),
-                    int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *))
-{
-  elementwise_op_templ<uint8_t, uint8_t, uint8x16_t>(in1, in2, out, window, scalar_func,
-                                                     broadcast_func, neon_func);
-}
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEActivationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEActivationLayerKernelEx.cpp
deleted file mode 100644
index 648705ba9..000000000
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEActivationLayerKernelEx.cpp
+++ /dev/null
@@ -1,730 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/NEActivationLayerKernelEx.h"
-
-#include "arm_compute/core/CPP/Validate.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/NEON/NEMath.h"
-#include "arm_compute/core/NEON/NESymm.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <arm_neon.h>
-#include <array>
-#include <cmath>
-#include <map>
-#include <set>
-
-using namespace arm_compute;
-namespace
-{
-/**
- * Check that an input/output/activation combination is accepted by this kernel.
- *
- * @param[in] input           Info of the input tensor; must not be null (it is dereferenced).
- * @param[in] output          Info of the output tensor, or nullptr. When null, the input's
- *                            quantization info is used for the quantization checks.
- * @param[in] activation_info Activation description; only activation() is read here.
- *
- * @return An empty Status when every check passes; otherwise the check macros return early
- *         with an error Status.
- */
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
-                          const ActivationLayerInfo &activation_info)
-{
-  // Data type gate: single channel U8, QASYMM8, QSYMM16, F16 or F32 only
-  ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
-      input, 1, DataType::U8, DataType::QASYMM8, DataType::QSYMM16, DataType::F16, DataType::F32);
-
-  // Activations accepted for asymmetric quantized input
-  static std::set<ActivationLayerInfo::ActivationFunction> qasymm8_supported_activations = {
-      ActivationLayerInfo::ActivationFunction::RELU,
-      ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
-      ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
-      ActivationLayerInfo::ActivationFunction::LOGISTIC,
-      ActivationLayerInfo::ActivationFunction::TANH};
-  // Activations accepted for symmetric quantized input
-  static std::set<ActivationLayerInfo::ActivationFunction> qsymm16_supported_activations = {
-      ActivationLayerInfo::ActivationFunction::LOGISTIC,
-      ActivationLayerInfo::ActivationFunction::TANH};
-  const DataType data_type = input->data_type();
-  // Quantization info of the tensor that will receive the result (the input when in-place)
-  const QuantizationInfo &oq_info =
-      (output != nullptr) ? output->quantization_info() : input->quantization_info();
-  const ActivationLayerInfo::ActivationFunction f_act = activation_info.activation();
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-      is_data_type_quantized_asymmetric(data_type) &&
-          (qasymm8_supported_activations.count(f_act) == 0),
-      "For QASYMM8 only tanh, logistic, relu and lower/upper bounded relu are supported");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_symmetric(data_type) &&
-                                      (qsymm16_supported_activations.count(f_act) == 0),
-                                  "For QSYMM16 only tanh and logistic are supported");
-  // Asymmetric quantized TANH requires the result quantization to be exactly (1/128, 128)
-  ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(data_type) &&
-                              (f_act == ActivationLayerInfo::ActivationFunction::TANH) &&
-                              (oq_info != QuantizationInfo(1.f / 128.f, 128)));
-  // Asymmetric quantized LOGISTIC requires the result quantization to be exactly (1/256, 0)
-  ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(data_type) &&
-                              (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) &&
-                              (oq_info != QuantizationInfo(1.f / 256.f, 0)));
-
-  // Symmetric quantized TANH and LOGISTIC both require a result quantization of (1/32768, 0)
-  ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) &&
-                              (f_act == ActivationLayerInfo::ActivationFunction::TANH) &&
-                              (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
-  ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) &&
-                              (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) &&
-                              (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
-
-  // Checks performed when output is configured
-  // (an output whose total_size() is zero is treated as not yet configured and is skipped)
-  if ((output != nullptr) && (output->total_size() != 0))
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-  }
-
-  return Status{};
-}
-
-/**
- * Build the execution window for the kernel and, if an output info is given, prepare it.
- *
- * @param[in]     input  Info of the input tensor; the window is computed from it.
- * @param[in,out] output Info of the output tensor, or nullptr. When non-null it is
- *                       auto-initialised from a clone of @p input if still empty, and its valid
- *                       region is set to cover its whole tensor shape.
- *
- * @return A pair holding an empty Status and the computed window.
- */
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
-  // Configure kernel window
-  Window max_window = calculate_max_window(*input, Steps());
-
-  // No separate output info: nothing else to prepare
-  if (output == nullptr)
-  {
-    return std::make_pair(Status{}, max_window);
-  }
-
-  // Output auto initialization if not yet initialized
-  auto_init_if_empty(*output, *input->clone());
-
-  // NEActivationLayerKernelEx doesn't need padding so update_window_and_padding() can be skipped
-  Coordinates anchor;
-  anchor.set_num_dimensions(output->num_dimensions());
-  output->set_valid_region(ValidRegion(anchor, output->tensor_shape()));
-
-  return std::make_pair(Status{}, max_window);
-}
-
-/**
- * Reinterpret a float32x4_t vector as a uint32x4_t vector via vreinterpretq_u32_f32.
- *
- * The misspelt name ("unsigend") is kept because callers in this file use it.
- */
-inline uint32x4_t vreinterpret_unsigend_int(const float32x4_t &vec)
-{
-  const uint32x4_t as_bits = vreinterpretq_u32_f32(vec);
-  return as_bits;
-}
-
-/** Reinterpret a uint32x4_t vector as a float32x4_t vector via vreinterpretq_f32_u32. */
-inline float32x4_t vreinterpret_floating_point(const uint32x4_t &vec)
-{
-  const float32x4_t as_floats = vreinterpretq_f32_u32(vec);
-  return as_floats;
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-/**
- * Half-precision overload: reinterpret a float16x8_t vector as a uint16x8_t vector via
- * vreinterpretq_u16_f16.
- */
-inline uint16x8_t vreinterpret_unsigend_int(const float16x8_t &vec)
-{
-  const uint16x8_t as_bits = vreinterpretq_u16_f16(vec);
-  return as_bits;
-}
-/**
- * Half-precision overload: reinterpret a uint16x8_t vector as a float16x8_t vector via
- * vreinterpretq_f16_u16.
- */
-inline float16x8_t vreinterpret_floating_point(const uint16x8_t &vec)
-{
-  const float16x8_t as_halfs = vreinterpretq_f16_u16(vec);
-  return as_halfs;
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC*/
-} // namespace
-
-/**
- * Default constructor.
- *
- * Leaves the kernel with no input, no output, no executor function and a value-initialised
- * activation info; configure() fills these in.
- */
-NEActivationLayerKernelEx::NEActivationLayerKernelEx()
-    : _input{nullptr},
-      _output{nullptr},
-      _func{nullptr},
-      _act_info{}
-{
-}
-
-/**
- * Bind the tensors, pick the executor matching data type and activation, and set the window.
- *
- * @param[in,out] input           Source tensor; must not be null. It is also used as the
- *                                destination when @p output is nullptr (in-place execution).
- * @param[out]    output          Destination tensor, or nullptr for in-place execution.
- * @param[in]     activation_info Activation description stored in the kernel and used to select
- *                                the executor.
- *
- * The arguments are checked with validate_arguments(); a failing Status is raised through
- * ARM_COMPUTE_ERROR_THROW_ON. An input data type other than QASYMM8, QSYMM16, F32 (or F16 when
- * __ARM_FEATURE_FP16_VECTOR_ARITHMETIC is defined) ends in ARM_COMPUTE_ERROR.
- *
- * The dispatch tables are immutable and queried with find(), so configuring a kernel never
- * writes to the shared function-local statics. An activation missing from the selected table
- * leaves _func as nullptr.
- */
-void NEActivationLayerKernelEx::configure(ITensor *input, ITensor *output,
-                                          ActivationLayerInfo activation_info)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input);
-
-  _input = input;
-  _act_info = activation_info;
-  _output = input;
-
-  // Out-of-place calculation
-  if (output != nullptr)
-  {
-    _output = output;
-  }
-
-  // Disabled activation, thus no operation needed
-  // (the data type switch below still assigns _func; run() is what skips the work)
-  if (!activation_info.enabled())
-  {
-    _func = nullptr;
-  }
-
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(
-      input->info(), (output != nullptr) ? output->info() : nullptr, activation_info));
-
-  using ExecutorMap = std::map<ActivationFunction, ActivationFunctionExecutorPtr>;
-
-  // Activation functions : FP32
-  static const ExecutorMap act_map_f32 = {
-      {ActivationFunction::ABS,
-       &NEActivationLayerKernelEx::activation<ActivationFunction::ABS, float>},
-      {ActivationFunction::LINEAR,
-       &NEActivationLayerKernelEx::activation<ActivationFunction::LINEAR, float>},
-      {ActivationFunction::LOGISTIC,
-       &NEActivationLayerKernelEx::activation<ActivationFunction::LOGISTIC, float>},
-      {ActivationFunction::RELU,
-       &NEActivationLayerKernelEx::activation<ActivationFunction::RELU, float>},
-      {ActivationFunction::BOUNDED_RELU,
-       &NEActivationLayerKernelEx::activation<ActivationFunction::BOUNDED_RELU, float>},
-      {ActivationFunction::LU_BOUNDED_RELU,
-       &NEActivationLayerKernelEx::activation<ActivationFunction::LU_BOUNDED_RELU, float>},
-      {ActivationFunction::LEAKY_RELU,
-       &NEActivationLayerKernelEx::activation<ActivationFunction::LEAKY_RELU, float>},
-      {ActivationFunction::SOFT_RELU,
-       &NEActivationLayerKernelEx::activation<ActivationFunction::SOFT_RELU, float>},
-      {ActivationFunction::ELU,
-       &NEActivationLayerKernelEx::activation<ActivationFunction::ELU, float>},
-      {ActivationFunction::SQRT,
-       &NEActivationLayerKernelEx::activation<ActivationFunction::SQRT, float>},
-      {ActivationFunction::SQUARE,
-       &NEActivationLayerKernelEx::activation<ActivationFunction::SQUARE, float>},
-      {ActivationFunction::TANH,
-       &NEActivationLayerKernelEx::activation<ActivationFunction::TANH, float>},
-      {ActivationFunction::IDENTITY,
-       &NEActivationLayerKernelEx::activation<ActivationFunction::IDENTITY, float>},
-  };
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-  // Activation functions : FP16
-  static const ExecutorMap act_map_f16 = {
-      {ActivationFunction::ABS,
-       &NEActivationLayerKernelEx::activation<ActivationFunction::ABS, float16_t>},
-      {ActivationFunction::LINEAR,
-       &NEActivationLayerKernelEx::activation<ActivationFunction::LINEAR, float16_t>},
-      {ActivationFunction::LOGISTIC,
-       &NEActivationLayerKernelEx::activation<ActivationFunction::LOGISTIC, float16_t>},
-      {ActivationFunction::RELU,
-       &NEActivationLayerKernelEx::activation<ActivationFunction::RELU, float16_t>},
-      {ActivationFunction::BOUNDED_RELU,
-       &NEActivationLayerKernelEx::activation<ActivationFunction::BOUNDED_RELU, float16_t>},
-      {ActivationFunction::LU_BOUNDED_RELU,
-       &NEActivationLayerKernelEx::activation<ActivationFunction::LU_BOUNDED_RELU, float16_t>},
-      {ActivationFunction::LEAKY_RELU,
-       &NEActivationLayerKernelEx::activation<ActivationFunction::LEAKY_RELU, float16_t>},
-      {ActivationFunction::SOFT_RELU,
-       &NEActivationLayerKernelEx::activation<ActivationFunction::SOFT_RELU, float16_t>},
-      {ActivationFunction::ELU,
-       &NEActivationLayerKernelEx::activation<ActivationFunction::ELU, float16_t>},
-      {ActivationFunction::SQRT,
-       &NEActivationLayerKernelEx::activation<ActivationFunction::SQRT, float16_t>},
-      {ActivationFunction::SQUARE,
-       &NEActivationLayerKernelEx::activation<ActivationFunction::SQUARE, float16_t>},
-      {ActivationFunction::TANH,
-       &NEActivationLayerKernelEx::activation<ActivationFunction::TANH, float16_t>},
-      {ActivationFunction::IDENTITY,
-       &NEActivationLayerKernelEx::activation<ActivationFunction::IDENTITY, float16_t>},
-  };
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC*/
-
-  // Activation functions : QASYMM8
-  // NOTE(review): IDENTITY is not in the QASYMM8 set accepted by validate_arguments() in this
-  // file, so that entry looks unreachable - confirm before relying on it.
-  static const ExecutorMap act_map_qasymm8 = {
-      {ActivationFunction::LOGISTIC,
-       &NEActivationLayerKernelEx::activation<ActivationFunction::LOGISTIC, qasymm8_t>},
-      {ActivationFunction::BOUNDED_RELU,
-       &NEActivationLayerKernelEx::activation<ActivationFunction::BOUNDED_RELU, qasymm8_t>},
-      {ActivationFunction::LU_BOUNDED_RELU,
-       &NEActivationLayerKernelEx::activation<ActivationFunction::LU_BOUNDED_RELU, qasymm8_t>},
-      {ActivationFunction::RELU,
-       &NEActivationLayerKernelEx::activation<ActivationFunction::RELU, qasymm8_t>},
-      {ActivationFunction::TANH,
-       &NEActivationLayerKernelEx::activation<ActivationFunction::TANH, qasymm8_t>},
-      {ActivationFunction::IDENTITY,
-       &NEActivationLayerKernelEx::activation<ActivationFunction::IDENTITY, qasymm8_t>},
-  };
-
-  // Activation functions : QSYMM16
-  static const ExecutorMap act_map_qsymm16 = {
-      {ActivationFunction::LOGISTIC,
-       &NEActivationLayerKernelEx::activation<ActivationFunction::LOGISTIC, qsymm16_t>},
-      {ActivationFunction::TANH,
-       &NEActivationLayerKernelEx::activation<ActivationFunction::TANH, qsymm16_t>},
-  };
-
-  // Read-only lookup: unlike operator[], find() never inserts into the shared table.
-  // A missing activation yields nullptr, the value operator[] would have produced.
-  const auto select_executor = [&activation_info](
-                                   const ExecutorMap &table) -> ActivationFunctionExecutorPtr {
-    const auto entry = table.find(activation_info.activation());
-    return (entry != table.end()) ? entry->second : nullptr;
-  };
-
-  switch (input->info()->data_type())
-  {
-    case DataType::QASYMM8:
-      _func = select_executor(act_map_qasymm8);
-      break;
-    case DataType::QSYMM16:
-      _func = select_executor(act_map_qsymm16);
-      break;
-    case DataType::F32:
-      _func = select_executor(act_map_f32);
-      break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    case DataType::F16:
-      _func = select_executor(act_map_f16);
-      break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-    default:
-      ARM_COMPUTE_ERROR("Unsupported data type.");
-  }
-
-  // Configure kernel window
-  auto win_config =
-      validate_and_configure_window(input->info(), (output != nullptr) ? output->info() : nullptr);
-  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-  ICPPKernel::configure(win_config.second);
-}
-
-/**
- * Floating-point executor: applies activation F to every element of the window.
- *
- * Enabled only when T satisfies arm_compute::utils::traits::is_floating_point. Each row of the
- * window is processed in two phases: a vector loop handling 16 / sizeof(T) elements per step
- * through the wrapper:: NEON helpers, then a scalar loop for the remaining elements.
- *
- * @tparam F Activation function, fixed at compile time.
- * @tparam T Element type of the input and output buffers.
- *
- * @param[in] window Region to process; its X range is walked manually inside the loop body.
- */
-template <ActivationLayerInfo::ActivationFunction F, typename T>
-typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type
-NEActivationLayerKernelEx::activation(const Window &window)
-{
-  /** NEON vector tag type. */
-  using ExactTagType =
-      typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
-
-  // Elements per 16-byte vector
-  const int window_step_x = 16 / sizeof(T);
-  const auto window_start_x = static_cast<int>(window.x().start());
-  const auto window_end_x = static_cast<int>(window.x().end());
-  const ActivationFunction act = F;
-
-  // X is reduced to a single step so the lambda below runs once per row and iterates X itself
-  Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
-  win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-  Iterator input(_input, win_collapsed);
-  Iterator output(_output, win_collapsed);
-
-  // Broadcast constants for the vector path, plus scalar copies of a/b for the tail loop
-  const auto infinity = wrapper::vdup_n(std::numeric_limits<T>::infinity(), ExactTagType{});
-  // NOTE(review): for half precision 1e-24 may not be representable - confirm it is still useful
-  const auto epsilon = wrapper::vdup_n(static_cast<T>(1e-24), ExactTagType{});
-  const auto const_1 = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{});
-  const auto const_0 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
-  const auto va = wrapper::vdup_n(static_cast<T>(_act_info.a()), ExactTagType{});
-  const auto vb = wrapper::vdup_n(static_cast<T>(_act_info.b()), ExactTagType{});
-  const auto a = static_cast<T>(_act_info.a());
-  const auto b = static_cast<T>(_act_info.b());
-
-  execute_window_loop(
-      win_collapsed,
-      [&](const Coordinates &) {
-        const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
-        const auto output_ptr = reinterpret_cast<T *>(output.ptr());
-
-        wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128> tmp;
-
-        // Compute window_step_x elements per iteration
-        int x = window_start_x;
-
-        for (; x <= (window_end_x - window_step_x); x += window_step_x)
-        {
-          const auto vin = wrapper::vloadq(input_ptr + x);
-          switch (act)
-          {
-            case ActivationFunction::ABS:
-              tmp = wrapper::vabs(vin);
-              break;
-            case ActivationFunction::LINEAR:
-              // a * x + b
-              tmp = wrapper::vmla(vb, va, vin);
-              break;
-            case ActivationFunction::LOGISTIC:
-              // exp(-vin)
-              tmp = wrapper::vexpq(wrapper::vneg(vin));
-
-              // NaN -> INF
-              // (lanes where tmp != tmp take the bits of infinity, all other lanes keep tmp)
-              tmp = vreinterpret_floating_point(wrapper::vorr(
-                  wrapper::vand(wrapper::vnot(wrapper::vceq(tmp, tmp)),
-                                vreinterpret_unsigend_int(infinity)),
-                  wrapper::vand(wrapper::vceq(tmp, tmp), vreinterpret_unsigend_int(tmp))));
-
-              // 1 / (1 + tmp)
-              tmp = wrapper::vinv(wrapper::vadd(const_1, tmp));
-              break;
-            case ActivationFunction::RELU:
-              tmp = wrapper::vmax(const_0, vin);
-              break;
-            case ActivationFunction::BOUNDED_RELU:
-              // min(a, max(0, x))
-              tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin));
-              break;
-            case ActivationFunction::LU_BOUNDED_RELU:
-              // min(a, max(b, x))
-              tmp = wrapper::vmin(va, wrapper::vmax(vb, vin));
-              break;
-            case ActivationFunction::LEAKY_RELU:
-              // x > 0 ? x : a * x
-              tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin));
-              break;
-            case ActivationFunction::SOFT_RELU:
-              // log(1 + exp(x))
-              tmp = wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin)));
-              break;
-            case ActivationFunction::ELU:
-              // x >= 0 ? x : a * (exp(x) - 1)
-              tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin,
-                                  wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1)));
-              break;
-            case ActivationFunction::SQRT:
-              // Computed as 1 / invsqrt(x + epsilon); note the scalar tail uses std::sqrt(x)
-              // without the epsilon term
-              tmp = wrapper::vinv(wrapper::vinvsqrt(vin + epsilon));
-              break;
-            case ActivationFunction::SQUARE:
-              tmp = wrapper::vmul(vin, vin);
-              break;
-            case ActivationFunction::TANH:
-              // a * tanh(b * x)
-              tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin)));
-              break;
-            case ActivationFunction::IDENTITY:
-              tmp = vin;
-              break;
-            default:
-              ARM_COMPUTE_ERROR("Unsupported activation function");
-          }
-          wrapper::vstore(output_ptr + x, tmp);
-        }
-
-        // Compute left-over elements
-        // (scalar versions of the same formulas, one element at a time)
-        for (; x < window_end_x; ++x)
-        {
-          const T in = *(reinterpret_cast<const T *>(input_ptr + x));
-          T tmp;
-          switch (act)
-          {
-            case ActivationFunction::ABS:
-              tmp = std::abs(in);
-              break;
-            case ActivationFunction::LINEAR:
-              tmp = a * in + b;
-              break;
-            case ActivationFunction::LOGISTIC:
-              tmp = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-in));
-              break;
-            case ActivationFunction::RELU:
-              tmp = std::max<T>(static_cast<T>(0), in);
-              break;
-            case ActivationFunction::BOUNDED_RELU:
-              tmp = std::min<T>(a, std::max(static_cast<T>(0), in));
-              break;
-            case ActivationFunction::LU_BOUNDED_RELU:
-              tmp = std::min<T>(a, std::max<T>(b, in));
-              break;
-            case ActivationFunction::LEAKY_RELU:
-              tmp = (in > 0) ? in : a * in;
-              break;
-            case ActivationFunction::SOFT_RELU:
-              tmp = std::log(static_cast<T>(1) + std::exp(in));
-              break;
-            case ActivationFunction::ELU:
-              tmp = (in >= 0) ? in : a * (std::exp(in) - 1);
-              break;
-            case ActivationFunction::SQRT:
-              tmp = std::sqrt(in);
-              break;
-            case ActivationFunction::SQUARE:
-              tmp = in * in;
-              break;
-            case ActivationFunction::TANH:
-              tmp = a * std::tanh(b * in);
-              break;
-            case ActivationFunction::IDENTITY:
-              tmp = in;
-              break;
-            default:
-              ARM_COMPUTE_ERROR("Unsupported activation function");
-          }
-          *(output_ptr + x) = tmp;
-        }
-      },
-      input, output);
-}
-
-/**
- * QASYMM8 executor: applies activation F to asymmetric 8-bit quantized data.
- *
- * Enabled only when T is qasymm8_t. The RELU family is evaluated directly on quantized values
- * (a, b and zero are quantized with the input quantization info) and the result is then mapped
- * to the output quantization with an affine transform. LOGISTIC and TANH de-quantize to float,
- * evaluate the function and quantize with the output quantization info. Any other activation
- * reaches ARM_COMPUTE_ERROR.
- *
- * @tparam F Activation function, fixed at compile time.
- * @tparam T Element type; qasymm8_t.
- *
- * @param[in] window Region to process; its X range is walked manually inside the loop body.
- */
-template <ActivationLayerInfo::ActivationFunction F, typename T>
-typename std::enable_if<std::is_same<T, qasymm8_t>::value, void>::type
-NEActivationLayerKernelEx::activation(const Window &window)
-{
-  // Elements per 16-byte vector
-  const int window_step_x = 16 / sizeof(T);
-  const auto window_start_x = static_cast<int>(window.x().start());
-  const auto window_end_x = static_cast<int>(window.x().end());
-  const ActivationFunction act = F;
-
-  // X is reduced to a single step so the lambda below runs once per row and iterates X itself
-  Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
-  win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-  Iterator input(_input, win_collapsed);
-  Iterator output(_output, win_collapsed);
-
-  const UniformQuantizationInfo qi_in = _input->info()->quantization_info().uniform();
-  const UniformQuantizationInfo qi_out = _output->info()->quantization_info().uniform();
-  // Activation bounds and zero expressed in the input's quantized domain (vector and scalar)
-  const qasymm8x16_t va = vdupq_n_u8(quantize_qasymm8(_act_info.a(), qi_in));
-  const qasymm8x16_t vb = vdupq_n_u8(quantize_qasymm8(_act_info.b(), qi_in));
-  const qasymm8_t a = quantize_qasymm8(_act_info.a(), qi_in);
-  const qasymm8_t b = quantize_qasymm8(_act_info.b(), qi_in);
-  const qasymm8_t const_0 = quantize_qasymm8(0.f, qi_in);
-  const qasymm8x16_t vconst_0 = vdupq_n_u8(const_0);
-  // Float constants used by the LOGISTIC and TANH paths
-  const auto vconst_1 = vdupq_n_f32(1.f);
-  const float32x4_t va_f32 = vdupq_n_f32(_act_info.a());
-  const float32x4_t vb_f32 = vdupq_n_f32(_act_info.b());
-  const float a_f32 = _act_info.a();
-  const float b_f32 = _act_info.b();
-
-  // Initialise scale/offset for re-quantization
-  // (q_out = q_in * s + o, with s = scale_in / scale_out and o = offset_out - offset_in * s)
-  float s = qi_in.scale / qi_out.scale;
-  float o = -qi_in.offset * s + qi_out.offset;
-  float32x4_t vs = vdupq_n_f32(s);
-  float32x4_t vo = vdupq_n_f32(o);
-
-  execute_window_loop(
-      win_collapsed,
-      [&](const Coordinates &) {
-        const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
-        const auto output_ptr = reinterpret_cast<T *>(output.ptr());
-
-        wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128> tmp;
-
-        // Compute window_step_x elements per iteration
-        int x = window_start_x;
-        for (; x <= (window_end_x - window_step_x); x += window_step_x)
-        {
-          const auto vin = wrapper::vloadq(input_ptr + x);
-          if (act == ActivationFunction::RELU)
-          {
-            // Perform activation
-            tmp = vmaxq_u8(vconst_0, vin);
-            // Re-quantize to new output space
-            tmp = vmlaq_qasymm8(tmp, vs, vo);
-          }
-          else if (act == ActivationFunction::BOUNDED_RELU)
-          {
-            // Perform activation
-            tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin));
-            // Re-quantize to new output space
-            tmp = vmlaq_qasymm8(tmp, vs, vo);
-          }
-          else if (act == ActivationFunction::LU_BOUNDED_RELU)
-          {
-            // Perform activation
-            tmp = vminq_u8(va, vmaxq_u8(vb, vin));
-            // Re-quantize to new output space
-            tmp = vmlaq_qasymm8(tmp, vs, vo);
-          }
-          else if (act == ActivationFunction::LOGISTIC)
-          {
-            // De-quantize
-            const auto vin_deq = vdequantize(vin, qi_in);
-            // Perform activation
-            // (1 / (1 + exp(-x)) on each of the four float sub-vectors)
-            const float32x4x4_t tmp_dep = {{
-                wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(
-                                                                    vin_deq.val[0])))),
-                wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(
-                                                                    vin_deq.val[1])))),
-                wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(
-                                                                    vin_deq.val[2])))),
-                wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(
-                                                                    vin_deq.val[3])))),
-            }};
-            // Re-quantize to new output space
-            tmp = vquantize(tmp_dep, qi_out);
-          }
-          else if (act == ActivationFunction::TANH)
-          {
-            // De-quantize
-            const auto vin_deq = vdequantize(vin, qi_in);
-            // Perform activation
-            // (a * tanh(b * x) on each of the four float sub-vectors)
-            const float32x4x4_t tmp_dep = {{
-                wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))),
-                wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))),
-                wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))),
-                wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))),
-            }};
-            // Re-quantize to new output space
-            tmp = vquantize(tmp_dep, qi_out);
-          }
-          else
-          {
-            ARM_COMPUTE_ERROR("Unsupported activation function");
-          }
-          wrapper::vstore(output_ptr + x, tmp);
-        }
-
-        // Compute left-over elements
-        // (scalar path; the RELU variants clamp the re-quantized value to [0, 255])
-        for (; x < window_end_x; ++x)
-        {
-          T in = *(reinterpret_cast<const T *>(input_ptr + x));
-          T tmp;
-          if (act == ActivationFunction::RELU)
-          {
-            tmp = std::max(const_0, in);
-            tmp = std::max<int32_t>(0, std::min<int32_t>(tmp * s + o, 255));
-          }
-          else if (act == ActivationFunction::BOUNDED_RELU)
-          {
-            tmp = std::min(a, std::max(const_0, in));
-            tmp = std::max<int32_t>(0, std::min<int32_t>(tmp * s + o, 255));
-          }
-          else if (act == ActivationFunction::LU_BOUNDED_RELU)
-          {
-            tmp = std::min(a, std::max(b, in));
-            tmp = std::max<int32_t>(0, std::min<int32_t>(tmp * s + o, 255));
-          }
-          else if (act == ActivationFunction::LOGISTIC)
-          {
-            float tmp_f = dequantize_qasymm8(in, qi_in);
-            tmp_f = 1.f / (1.f + std::exp(-tmp_f));
-            tmp = quantize_qasymm8(tmp_f, qi_out);
-          }
-          else if (act == ActivationFunction::TANH)
-          {
-            float tmp_f = dequantize_qasymm8(in, qi_in);
-            tmp_f = a_f32 * std::tanh(b_f32 * tmp_f);
-            tmp = quantize_qasymm8(tmp_f, qi_out);
-          }
-          else
-          {
-            ARM_COMPUTE_ERROR("Unsupported activation function");
-          }
-          *(output_ptr + x) = tmp;
-        }
-      },
-      input, output);
-}
-
-/**
- * QSYMM16 executor: applies LOGISTIC or TANH to symmetric 16-bit quantized data.
- *
- * Enabled only when T is qsymm16_t. Values are de-quantized with the input scale, the function
- * is evaluated in float, and the result is quantized for the output. Any other activation
- * reaches ARM_COMPUTE_ERROR.
- *
- * @tparam F Activation function, fixed at compile time.
- * @tparam T Element type; qsymm16_t.
- *
- * @param[in] window Region to process; its X range is walked manually inside the loop body.
- */
-template <ActivationLayerInfo::ActivationFunction F, typename T>
-typename std::enable_if<std::is_same<T, qsymm16_t>::value, void>::type
-NEActivationLayerKernelEx::activation(const Window &window)
-{
-  // Elements per 16-byte vector
-  const int window_step_x = 16 / sizeof(T);
-  const auto window_start_x = static_cast<int>(window.x().start());
-  const auto window_end_x = static_cast<int>(window.x().end());
-  const ActivationFunction act = F;
-
-  // X is reduced to a single step so the lambda below runs once per row and iterates X itself
-  Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
-  win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-  Iterator input(_input, win_collapsed);
-  Iterator output(_output, win_collapsed);
-
-  const UniformQuantizationInfo qi_in = _input->info()->quantization_info().uniform();
-  const UniformQuantizationInfo qi_out = _output->info()->quantization_info().uniform();
-  // Float constants: vector forms for the main loop, scalar forms for the tail loop
-  const auto vconst_1 = vdupq_n_f32(1.f);
-  const float32x4_t va_f32 = vdupq_n_f32(_act_info.a());
-  const float32x4_t vb_f32 = vdupq_n_f32(_act_info.b());
-  const float a_f32 = _act_info.a();
-  const float b_f32 = _act_info.b();
-
-  execute_window_loop(
-      win_collapsed,
-      [&](const Coordinates &) {
-        const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
-        const auto output_ptr = reinterpret_cast<T *>(output.ptr());
-
-        wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128> tmp;
-        ARM_COMPUTE_UNUSED(tmp);
-
-        // Compute window_step_x elements per iteration
-        int x = window_start_x;
-        for (; x <= (window_end_x - window_step_x); x += window_step_x)
-        {
-          const auto vin = wrapper::vloadq(input_ptr + x);
-          if (act == ActivationFunction::LOGISTIC)
-          {
-            // De-quantize
-            const auto vin_deq = vdequantize_int16(vin, qi_in.scale);
-            // Perform activation
-            // (1 / (1 + exp(-x)) on each of the two float sub-vectors)
-            const float32x4x2_t tmp_dep = {{
-                wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(
-                                                                    vin_deq.val[0])))),
-                wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(
-                                                                    vin_deq.val[1])))),
-            }};
-            // Re-quantize to new output space
-            tmp = vquantize_int16(tmp_dep, qi_out.scale);
-          }
-          else if (act == ActivationFunction::TANH)
-          {
-            // De-quantize
-            const auto vin_deq = vdequantize_int16(vin, qi_in.scale);
-            // Perform activation
-            // (a * tanh(b * x) on each of the two float sub-vectors)
-            const float32x4x2_t tmp_dep = {{
-                wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))),
-                wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))),
-            }};
-            // Re-quantize to new output space
-            tmp = vquantize_int16(tmp_dep, qi_out.scale);
-          }
-          else
-          {
-            ARM_COMPUTE_ERROR("Unsupported activation function");
-          }
-          wrapper::vstore(output_ptr + x, tmp);
-        }
-
-        // Compute left-over elements
-        // (scalar path using the same de-quantize / evaluate / quantize sequence)
-        for (; x < window_end_x; ++x)
-        {
-          T in = *(reinterpret_cast<const T *>(input_ptr + x));
-          T tmp;
-          if (act == ActivationFunction::LOGISTIC)
-          {
-            float tmp_f = dequantize_qsymm16(in, qi_in.scale);
-            tmp_f = 1.f / (1.f + std::exp(-tmp_f));
-            tmp = quantize_qsymm16(tmp_f, qi_out);
-          }
-          else if (act == ActivationFunction::TANH)
-          {
-            float tmp_f = dequantize_qsymm16(in, qi_in.scale);
-            tmp_f = a_f32 * std::tanh(b_f32 * tmp_f);
-            tmp = quantize_qsymm16(tmp_f, qi_out);
-          }
-          else
-          {
-            ARM_COMPUTE_ERROR("Unsupported activation function");
-          }
-          *(output_ptr + x) = tmp;
-        }
-      },
-      input, output);
-}
-
-/**
- * Static check of a prospective configuration, without touching the caller's tensor infos.
- *
- * @param[in] input    Info of the input tensor.
- * @param[in] output   Info of the output tensor, or nullptr.
- * @param[in] act_info Activation description.
- *
- * @return The first failing Status from the argument checks or the window configuration,
- *         otherwise an empty Status.
- */
-Status NEActivationLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                           const ActivationLayerInfo &act_info)
-{
-  ARM_COMPUTE_UNUSED(act_info);
-
-  // Argument checks come first and may return early
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, act_info));
-
-  // The window configuration may modify the infos it is given, so it runs on clones
-  const auto input_clone = input->clone();
-  const auto output_clone = (output != nullptr) ? output->clone() : nullptr;
-  ARM_COMPUTE_RETURN_ON_ERROR(
-      validate_and_configure_window(input_clone.get(), output_clone.get()).first);
-
-  return Status{};
-}
-
-/**
- * Execute the configured activation on a sub-window.
- *
- * Does nothing when the stored activation info is disabled. Otherwise the kernel state and the
- * sub-window are checked with the ARM_COMPUTE_ERROR_ON* macros and the executor selected by
- * configure() is invoked.
- *
- * @param[in] window Sub-window of the configured window to process.
- * @param[in] info   Thread information; unused.
- */
-void NEActivationLayerKernelEx::run(const Window &window, const ThreadInfo &info)
-{
-  ARM_COMPUTE_UNUSED(info);
-
-  // Only an enabled activation has any work to do
-  if (_act_info.enabled())
-  {
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
-    (this->*_func)(window);
-  }
-}
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp
deleted file mode 100644
index 32d7d6237..000000000
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp
+++ /dev/null
@@ -1,253 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
-#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-
-#include <algorithm>
-#include <arm_neon.h>
-#include <map>
-#include <string>
-
-namespace arm_compute
-{
-class Coordinates;
-} // namespace arm_compute
-
-namespace arm_compute
-{
-
-template <BinaryLogicalOperation op, typename ScalarType>
-inline ScalarType elementwise_logic_op_scalar(const ScalarType &a, const ScalarType &b)
-{
-  auto res = ScalarType(0);
-
-  switch (op)
-  {
-    case BinaryLogicalOperation::AND:
-      res = a & b;
-      break;
-    case BinaryLogicalOperation::OR:
-      res = a | b;
-      break;
-    default:
-      ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
-  }
-  return res;
-}
-
-template <BinaryLogicalOperation op, typename VectorType>
-inline VectorType elementwise_logic_op(const VectorType &a, const VectorType &b)
-{
-  VectorType res = {0, 0, 0, 0};
-
-  switch (op)
-  {
-    case BinaryLogicalOperation::AND:
-      res = wrapper::vand(a, b);
-      break;
-    case BinaryLogicalOperation::OR:
-      res = wrapper::vorr(a, b);
-      break;
-    default:
-      ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
-  }
-  return res;
-}
-
-template <BinaryLogicalOperation op>
-inline uint8x16x4_t elementwise_logic_op(const uint8x16x4_t &a, const uint8x16x4_t &b)
-{
-  uint8x16x4_t out = {{
-      elementwise_logic_op<op>(a.val[0], b.val[0]), elementwise_logic_op<op>(a.val[1], b.val[1]),
-      elementwise_logic_op<op>(a.val[2], b.val[2]), elementwise_logic_op<op>(a.val[3], b.val[3]),
-  }};
-  return out;
-}
-
-template <BinaryLogicalOperation op, typename ScalarType, typename VectorType>
-inline VectorType elementwise_logic_op_broadcast(const VectorType &a,
-                                                 const ScalarType &broadcast_value,
-                                                 const bool reorder)
-{
-  VectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag());
-  return elementwise_logic_op<op>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector);
-}
-
-template <BinaryLogicalOperation op, typename ScalarType, typename VectorType>
-inline int elementwise_logic_op_loop(int window_start_x, int window_end_x, int window_step_x,
-                                     const ScalarType *input1_ptr, const ScalarType *input2_ptr,
-                                     ScalarType *output_ptr)
-{
-  int x = window_start_x;
-  for (; x <= (window_end_x - window_step_x); x += window_step_x)
-  {
-    const auto a = wrapper::vloadq(input1_ptr + x);
-    const auto b = wrapper::vloadq(input2_ptr + x);
-    wrapper::vstore(output_ptr + x, elementwise_logic_op<op>(a, b));
-  }
-  return x;
-}
-
-template <BinaryLogicalOperation op, typename ScalarType, typename VectorType>
-inline int elementwise_logic_op_broadcast_loop(int window_start_x, int window_end_x,
-                                               int window_step_x,
-                                               const ScalarType *non_broadcast_input_ptr,
-                                               const ScalarType &broadcast_value,
-                                               ScalarType *output_ptr, const bool reorder)
-{
-  int x = window_start_x;
-  for (; x <= (window_end_x - window_step_x); x += window_step_x)
-  {
-    const auto a = wrapper::vloadq((non_broadcast_input_ptr + x));
-    wrapper::vstore(output_ptr + x,
-                    elementwise_logic_op_broadcast<op>(a, broadcast_value, reorder));
-  }
-  return x;
-}
-
-template <BinaryLogicalOperation op, typename ScalarType, typename VectorType>
-void elementwise_logic_op(const ITensor *in1, const ITensor *in2, ITensor *out,
-                          const Window &window)
-{
-  elementwise_op(in1, in2, out, window, &elementwise_logic_op_scalar<op, ScalarType>,
-                 &elementwise_logic_op_broadcast_loop<op, ScalarType, VectorType>,
-                 &elementwise_logic_op_loop<op, ScalarType, VectorType>);
-}
-
-std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)> configure_func(
-    const ITensor *input1, const ITensor *input2, ITensor *output,
-    std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function)
-{
-  std::string function_to_call("op_");
-  function_to_call += string_from_data_type(input1->info()->data_type()) + "_";
-  function_to_call += string_from_data_type(input2->info()->data_type()) + "_";
-  function_to_call += string_from_data_type(output->info()->data_type());
-
-  auto it = map_function.find(function_to_call);
-
-  if (it != map_function.end())
-  {
-    auto func = it->second;
-    return [func](const ITensor *input1, const ITensor *input2, ITensor *output,
-                  const Window &window) { func(input1, input2, output, window); };
-  }
-  return nullptr;
-}
-
-template <BinaryLogicalOperation op>
-std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)>
-configure_logic_func(const ITensor *input1, const ITensor *input2, ITensor *output)
-{
-  static std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function = {
-      {"op_U8_U8_U8", &elementwise_logic_op<op, uint8_t, uint8x16_t>},
-      {"op_QASYMM8_QASYMM8_QASYMM8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}};
-
-  return configure_func(input1, input2, output, map_function);
-}
-
-void NEBinaryLogicalOperationKernel::configure(BinaryLogicalOperation op, const ITensor *input1,
-                                               const ITensor *input2, ITensor *output)
-{
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info()));
-  configure_common(input1, input2, output);
-  switch (op)
-  {
-    case BinaryLogicalOperation::AND:
-      _function = configure_logic_func<BinaryLogicalOperation::AND>(input1, input2, output);
-      break;
-    case BinaryLogicalOperation::OR:
-      _function = configure_logic_func<BinaryLogicalOperation::OR>(input1, input2, output);
-      break;
-    default:
-      ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
-  }
-}
-
-Status NEBinaryLogicalOperationKernel::validate_arguments(const ITensorInfo &input1,
-                                                          const ITensorInfo &input2,
-                                                          const ITensorInfo &output)
-{
-  // Validate in case of configured output
-  if (output.total_size() > 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8,
-                                                         DataType::QASYMM8);
-  }
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8);
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2);
-
-  const TensorShape out_shape =
-      TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
-                                  "Inputs are not broadcast compatible");
-
-  // Validate in case of configured output
-  if (output.total_size() > 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
-        "Wrong shape for output");
-  }
-
-  return Status{};
-}
-
-Status NEBinaryLogicalOperationKernel::validate(BinaryLogicalOperation op,
-                                                const ITensorInfo *input1,
-                                                const ITensorInfo *input2,
-                                                const ITensorInfo *output)
-{
-  ARM_COMPUTE_UNUSED(op);
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output));
-  return Status{};
-}
-
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp
deleted file mode 100644
index 12017e543..000000000
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp
+++ /dev/null
@@ -1,343 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/NECastBoolKernel.h"
-
-#include "arm_compute/core/CPP/Validate.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEMath.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/SaturateCast.h"
-
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output);
-  ARM_COMPUTE_RETURN_ERROR_ON(input == output);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8,
-                                                       DataType::S16, DataType::U16, DataType::F16,
-                                                       DataType::U32, DataType::S32, DataType::F32);
-
-  // Validate in case of configured output
-  if (output->total_size() > 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-  }
-
-  return Status{};
-}
-} // namespace
-
-NECastBoolKernel::NECastBoolKernel() : _input(nullptr), _output(nullptr) {}
-
-void NECastBoolKernel::configure(const ITensor *input, ITensor *output)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-  // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype
-  // must be given)
-  set_shape_if_empty(*output->info(), input->info()->tensor_shape());
-
-  _input = input;
-  _output = output;
-
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
-
-  // Configure kernel window
-  Window win = calculate_max_window(*input->info(), Steps());
-  Coordinates coord;
-  coord.set_num_dimensions(output->info()->num_dimensions());
-  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-
-  ICPPKernel::configure(win);
-}
-
-Status NECastBoolKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
-  return Status{};
-}
-
-void NECastBoolKernel::run(const Window &window, const ThreadInfo &info)
-{
-  ARM_COMPUTE_UNUSED(info);
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-  ARM_COMPUTE_ERROR_ON_NULLPTR(_input, _output);
-  ARM_COMPUTE_ERROR_ON(_input == _output);
-
-  const auto window_start_x = static_cast<int>(window.x().start());
-  const auto window_end_x = static_cast<int>(window.x().end());
-  const int window_step_x = 16;
-
-  Window win{window};
-  win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-  Iterator input(_input, win);
-  Iterator output(_output, win);
-
-  const uint8_t true_val = 1;
-  const uint8x8_t mask_bool = vdup_n_u8(true_val);
-
-  switch (_output->info()->data_type())
-  {
-    case DataType::S8:
-    {
-      /* Conversion U8 -> S8 */
-      execute_window_loop(win,
-                          [&](const Coordinates &) {
-                            const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
-                            const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-
-                            int x = window_start_x;
-                            for (; x <= (window_end_x - window_step_x); x += window_step_x)
-                            {
-                              const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
-
-                              vst1q_s8(output_ptr + x, vreinterpretq_s8_u8(vandq_u8(
-                                                           texels_u8, vdupq_n_u8(true_val))));
-                            }
-
-                            // Compute left-over elements
-                            for (; x < window_end_x; ++x)
-                            {
-                              *(output_ptr + x) = static_cast<int8_t>(*(input_ptr + x) & true_val);
-                            }
-                          },
-                          input, output);
-      break;
-    }
-    case DataType::S16:
-    {
-      /* Up-conversion U8 -> S16 */
-      execute_window_loop(
-          win,
-          [&](const Coordinates &) {
-            const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
-            const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
-            int x = window_start_x;
-            for (; x <= (window_end_x - window_step_x); x += window_step_x)
-            {
-              const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
-
-              const int16x8x2_t texels = {
-                  {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))),
-                   vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}};
-
-              vst1q_s16(output_ptr + x, texels.val[0]);
-              vst1q_s16(output_ptr + x + 8, texels.val[1]);
-            }
-
-            // Compute left-over elements
-            for (; x < window_end_x; ++x)
-            {
-              *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) & true_val);
-            }
-          },
-          input, output);
-      break;
-    }
-    case DataType::S32:
-    {
-      /* Up-conversion U8 -> S32 */
-      execute_window_loop(
-          win,
-          [&](const Coordinates &) {
-            const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
-            const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());
-
-            int x = window_start_x;
-            for (; x <= (window_end_x - window_step_x); x += window_step_x)
-            {
-              const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
-
-              const int16x8x2_t texels = {
-                  {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))),
-                   vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}};
-
-              vst1q_s32(output_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
-              vst1q_s32(output_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
-              vst1q_s32(output_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
-              vst1q_s32(output_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
-            }
-
-            // Compute left-over elements
-            for (; x < window_end_x; ++x)
-            {
-              *(output_ptr + x) = static_cast<uint32_t>(*(input_ptr + x) & true_val);
-            }
-          },
-          input, output);
-      break;
-    }
-    case DataType::F32:
-    {
-      /* Up-conversion U8 -> F32 */
-      execute_window_loop(
-          win,
-          [&](const Coordinates &) {
-            const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
-            const auto output_ptr = reinterpret_cast<float *>(output.ptr());
-
-            int x = window_start_x;
-            for (; x <= (window_end_x - window_step_x); x += window_step_x)
-            {
-              const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
-
-              const int16x8x2_t texels = {
-                  {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))),
-                   vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}};
-              vst1q_f32(output_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
-              vst1q_f32(output_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
-              vst1q_f32(output_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
-              vst1q_f32(output_ptr + x + 12,
-                        vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
-            }
-
-            // Compute left-over elements
-            for (; x < window_end_x; ++x)
-            {
-              auto in = static_cast<uint32_t>(*(input_ptr + x) & true_val);
-              *(output_ptr + x) = static_cast<float>(in);
-            }
-          },
-          input, output);
-      break;
-    }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    case DataType::F16:
-    {
-      /* Up-conversion U8 -> F16 */
-      execute_window_loop(
-          win,
-          [&](const Coordinates &) {
-            const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
-            const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
-
-            int x = window_start_x;
-            for (; x <= (window_end_x - window_step_x); x += window_step_x)
-            {
-              const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
-
-              const int16x8x2_t texels = {
-                  {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))),
-                   vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}};
-              vst1q_f16(output_ptr + x, vcvtq_f16_s16(texels.val[0]));
-              vst1q_f16(output_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
-            }
-
-            // Compute left-over elements
-            for (; x < window_end_x; ++x)
-            {
-              *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) & true_val);
-            }
-          },
-          input, output);
-      break;
-    }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    case DataType::U8:
-    {
-      /* Conversion U8 -> S8 */
-      execute_window_loop(win,
-                          [&](const Coordinates &) {
-                            const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
-                            const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
-                            int x = window_start_x;
-                            for (; x <= (window_end_x - window_step_x); x += window_step_x)
-                            {
-                              const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
-
-                              vst1q_u8(output_ptr + x, vandq_u8(texels_u8, vdupq_n_u8(true_val)));
-                            }
-
-                            // Compute left-over elements
-                            for (; x < window_end_x; ++x)
-                            {
-                              *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) & true_val);
-                            }
-                          },
-                          input, output);
-      break;
-    }
-    case DataType::U16:
-    {
-      /* Up-conversion U8 -> U16 */
-      execute_window_loop(
-          win,
-          [&](const Coordinates &) {
-            const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
-            const auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr());
-
-            int x = window_start_x;
-            for (; x <= (window_end_x - window_step_x); x += window_step_x)
-            {
-              const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
-
-              const uint16x8x2_t texels = {{vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool)),
-                                            vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool))}};
-
-              vst1q_u16(output_ptr + x, texels.val[0]);
-              vst1q_u16(output_ptr + x + 8, texels.val[1]);
-            }
-
-            // Compute left-over elements
-            for (; x < window_end_x; ++x)
-            {
-              *(output_ptr + x) = static_cast<uint16_t>(*(input_ptr + x) & true_val);
-            }
-          },
-          input, output);
-      break;
-    }
-    default:
-      ARM_COMPUTE_ERROR("Output data type not supported");
-  }
-}
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp
deleted file mode 100644
index 091d38c56..000000000
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-using namespace arm_compute;
-
-NEEmbeddingLookupKernel::NEEmbeddingLookupKernel()
-    : _input(nullptr), _lookups(nullptr), _output(nullptr)
-{
-}
-
-void NEEmbeddingLookupKernel::configure(const ITensor *input, ITensor *output,
-                                        const ITensor *lookups)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info()));
-
-  _input = input;
-  _output = output;
-  _lookups = lookups;
-
-  // Auto initialize output if not initialized
-  auto out_shape = input->info()->tensor_shape();
-  out_shape.set(out_shape.num_dimensions() - 1, lookups->info()->num_dimensions());
-  auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type(),
-                     input->info()->quantization_info());
-
-  INEKernel::configure(calculate_max_window(*output->info()));
-}
-
-Status NEEmbeddingLookupKernel::validate(const arm_compute::ITensorInfo *input,
-                                         const arm_compute::ITensorInfo *output,
-                                         const arm_compute::ITensorInfo *lookups)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
-      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
-      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
-
-  ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4);
-  ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1);
-
-  // Validate in case of configured output
-  if (output->total_size() > 0)
-  {
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON(input->num_dimensions() != output->num_dimensions());
-    ARM_COMPUTE_ERROR_ON(output->dimension(output->num_dimensions() - 1) != lookups->dimension(0));
-    for (size_t i = 0; i < output->num_dimensions() - 1; ++i)
-    {
-      ARM_COMPUTE_ERROR_ON(input->dimension(i) != output->dimension(i));
-    }
-  }
-
-  return Status{};
-}
-
-void NEEmbeddingLookupKernel::run(const Window &window, const ThreadInfo &info)
-{
-  ARM_COMPUTE_UNUSED(info);
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
-  const size_t lookup_dim = _output->info()->num_dimensions() - 1;
-
-  Window output_window{window};
-  output_window.set(Window::DimX,
-                    Window::Dimension(output_window.x().start(), output_window.x().end(),
-                                      _input->info()->dimension(0)));
-
-  Window out_slice = output_window.first_slice_window_4D();
-  do
-  {
-    Iterator output_it(_output, out_slice);
-
-    execute_window_loop(out_slice,
-                        [&](const Coordinates &id) {
-                          const int32_t lookup = *reinterpret_cast<int32_t *>(
-                              _lookups->ptr_to_element(Coordinates{id[lookup_dim]}));
-                          Coordinates input_id{id};
-                          input_id.set(lookup_dim, lookup);
-                          memcpy(output_it.ptr(), _input->ptr_to_element(input_id),
-                                 _output->info()->dimension(0) * _output->info()->element_size());
-                        },
-                        output_it);
-
-  } while (window.slide_window_slice_4D(out_slice));
-}
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp
deleted file mode 100644
index 93963a504..000000000
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp
+++ /dev/null
@@ -1,272 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h"
-
-#include "arm_compute/core/CPP/Validate.h"
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
-
-namespace arm_compute
-{
-namespace
-{
-/** Assert that the gather indices are not negative.
- *
- * Only the entries along dimension 0 of the indices tensor are visited. For an unsigned U the
- * comparison can never be true, so the check is only meaningful for signed index types.
- *
- * @tparam U Element type used to read the indices.
- *
- * @param[in] indices Indices tensor.
- */
-template <typename U> void validate_indices(const ITensor *indices)
-{
-  const size_t num_entries = indices->info()->tensor_shape()[0];
-  for (size_t pos = 0; pos < num_entries; ++pos)
-  {
-    ARM_COMPUTE_ERROR_ON(*(reinterpret_cast<U *>(indices->ptr_to_element(Coordinates(pos)))) < 0);
-  }
-}
-
-} // namespace
-
-/** Default constructor.
- *
- * Value-initializes every member, so the kernel starts without tensors and without a selected
- * gather routine until configure() is called.
- */
-NEGatherKernelEx::NEGatherKernelEx()
-    : _input{}, _indices{}, _axis{}, _indices_rank{}, _output{}, _func{}
-{
-}
-
-/** Gather along axis 0, one element per window step.
- *
- * For each output coordinate the index is read from _indices using the leading _indices_rank
- * coordinates, the coordinate is collapsed over those dimensions, dimension 0 is replaced by
- * the index and a single element is copied from _input to the output.
- *
- * @tparam U Element type of the indices tensor.
- *
- * @param[in] window Region of the output to process.
- * @param[in] info   Thread information (unused).
- */
-template <typename U>
-inline void NEGatherKernelEx::gather_0_axis(const Window &window, const ThreadInfo &info)
-{
-  ARM_COMPUTE_UNUSED(info);
-
-  // Validate that the indices are not negative
-  validate_indices<U>(_indices);
-
-  // Bytes copied per output element
-  const size_t elem_bytes = _output->info()->element_size();
-
-  Iterator out_it(_output, window);
-  execute_window_loop(
-      window,
-      [&](const Coordinates &id) {
-        Coordinates src_id(id);
-        src_id.collapse(_indices_rank);
-
-        // Locate the index entry that belongs to this output coordinate
-        const uint8_t *index_ptr = nullptr;
-        switch (_indices_rank)
-        {
-          case 1:
-            index_ptr = _indices->ptr_to_element(Coordinates(id[0]));
-            break;
-          case 2:
-            index_ptr = _indices->ptr_to_element(Coordinates(id[0], id[1]));
-            break;
-          case 3:
-            index_ptr = _indices->ptr_to_element(Coordinates(id[0], id[1], id[2]));
-            break;
-          default:
-            ARM_COMPUTE_ERROR("Wrong num of dimensions");
-            break;
-        }
-        const U gathered = *(reinterpret_cast<const U *>(index_ptr));
-
-        src_id.set(0, gathered);
-
-        std::copy_n(_input->ptr_to_element(src_id), elem_bytes, out_it.ptr());
-      },
-      out_it);
-}
-
-/** Gather along a non-zero axis, one run of input dimension(0) elements per step.
- *
- * The X dimension of the window is reduced to a single iteration. For each remaining output
- * coordinate the index is read from _indices using the coordinates starting at _axis, the
- * coordinate is collapsed over the index dimensions, dimension _axis is replaced by the index
- * and a whole run along dimension 0 is copied from _input to the output.
- *
- * @tparam U Element type of the indices tensor.
- *
- * @param[in] window Region of the output to process.
- * @param[in] info   Thread information (unused).
- */
-template <typename U>
-void NEGatherKernelEx::gather_n_axis(const Window &window, const ThreadInfo &info)
-{
-  ARM_COMPUTE_UNUSED(info);
-
-  // Validate that the indices are not negative
-  validate_indices<U>(_indices);
-
-  // Bytes copied per iteration: one full run along dimension 0 of the input
-  const size_t row_bytes = _input->info()->dimension(0) * _output->info()->element_size();
-
-  // Rows are copied whole, so X is visited only once
-  Window row_window{window};
-  row_window.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-  Iterator out_it(_output, row_window);
-  execute_window_loop(
-      row_window,
-      [&](const Coordinates &id) {
-        Coordinates src_id(id);
-        src_id.collapse(_indices_rank, _axis);
-
-        // Locate the index entry that belongs to this output coordinate
-        const uint8_t *index_ptr = nullptr;
-        switch (_indices_rank)
-        {
-          case 1:
-            index_ptr = _indices->ptr_to_element(Coordinates(id[_axis]));
-            break;
-          case 2:
-            index_ptr = _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1]));
-            break;
-          case 3:
-            index_ptr =
-                _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1], id[_axis + 2]));
-            break;
-          default:
-            ARM_COMPUTE_ERROR("Wrong num of dimensions");
-            break;
-        }
-        const U gathered = *(reinterpret_cast<const U *>(index_ptr));
-
-        src_id.set(_axis, gathered);
-
-        std::copy_n(_input->ptr_to_element(src_id), row_bytes, out_it.ptr());
-      },
-      out_it);
-}
-
-/** Bind the tensors, select the gather routine and set up the execution window.
- *
- * A negative axis is wrapped by adding the input rank. The routine is chosen from the indices
- * data type (U32 or S32) and from whether the wrapped axis is 0. The output is auto-initialized
- * from the computed gather shape when it is still empty.
- *
- * @param[in]  input   Source tensor.
- * @param[in]  indices Indices tensor with at most 3 dimensions, of type U32 or S32.
- * @param[out] output  Destination tensor.
- * @param[in]  axis    Axis to gather along; may be negative.
- */
-void NEGatherKernelEx::configure(const ITensor *input, const ITensor *indices, ITensor *output,
-                                 int axis)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
-  ARM_COMPUTE_ERROR_ON(indices->info()->num_dimensions() > 3);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
-      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
-      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
-
-  _input = input;
-  _indices = indices;
-  _output = output;
-  _indices_rank = indices->info()->num_dimensions();
-
-  // Wrap a negative axis around the input rank
-  _axis = axis;
-  if (_axis < 0)
-  {
-    _axis += input->info()->num_dimensions();
-  }
-  ARM_COMPUTE_ERROR_ON(_axis < 0 || _axis >= static_cast<int32_t>(input->info()->num_dimensions()));
-
-  // Pick the routine by index type first, then by whether the gather runs along axis 0
-  const bool along_first_axis = (0 == _axis);
-  switch (_indices->info()->data_type())
-  {
-    case DataType::U32:
-      if (along_first_axis)
-      {
-        _func = &NEGatherKernelEx::gather_0_axis<uint32_t>;
-      }
-      else
-      {
-        _func = &NEGatherKernelEx::gather_n_axis<uint32_t>;
-      }
-      break;
-    case DataType::S32:
-      if (along_first_axis)
-      {
-        _func = &NEGatherKernelEx::gather_0_axis<int32_t>;
-      }
-      else
-      {
-        _func = &NEGatherKernelEx::gather_n_axis<int32_t>;
-      }
-      break;
-    default:
-      ARM_COMPUTE_ERROR("Not supported");
-      break;
-  }
-
-  // Output auto initialization if not yet initialized
-  const TensorShape gathered_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex(
-      input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis);
-  auto_init_if_empty(*output->info(), gathered_shape, 1, input->info()->data_type());
-
-  // Create window
-  Window win = calculate_max_window(*output->info(), Steps());
-  output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-  INEKernel::configure(win);
-}
-
-/** Check whether the given tensor infos describe a valid gather configuration.
- *
- * Every failed check is reported through the returned Status.
- *
- * @param[in] input   Source tensor info; at most 4 dimensions.
- * @param[in] indices Indices tensor info; at most 3 dimensions, of type U32 or S32.
- * @param[in] output  Destination tensor info; only checked when its total size is non-zero.
- * @param[in] axis    Axis to gather along; a negative value is wrapped by the input rank.
- *
- * @return An empty Status on success, an error Status otherwise.
- */
-Status NEGatherKernelEx::validate(const ITensorInfo *input, const ITensorInfo *indices,
-                                  const ITensorInfo *output, int axis)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output);
-  ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 3);
-  ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
-  // The gathered result has rank(input) + rank(indices) - 1 dimensions and must fit in 4.
-  // This is reported through the Status like the other checks, not asserted.
-  ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > 4);
-
-  if (axis < 0)
-  {
-    axis += input->num_dimensions();
-  }
-
-  ARM_COMPUTE_RETURN_ERROR_ON(0 > axis || axis >= static_cast<int32_t>(input->num_dimensions()));
-  ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
-      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
-      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
-
-  // Validate in case of configured output
-  if (output->total_size() != 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
-    TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex(
-        input->tensor_shape(), indices->tensor_shape(), axis);
-    ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
-  }
-
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32);
-
-  return Status{};
-}
-
-/** Execute the gather routine selected by configure() on the given window.
- *
- * @param[in] window Region of the output to process.
- * @param[in] info   Thread information, forwarded to the selected routine.
- */
-void NEGatherKernelEx::run(const Window &window, const ThreadInfo &info)
-{
-  ARM_COMPUTE_UNUSED(info);
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  // _func is only assigned in configure()
-  ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
-  // Dispatch through the member-function pointer chosen for the index type and axis
-  (this->*_func)(window, info);
-}
-
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp
deleted file mode 100644
index 30787c0a4..000000000
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp
+++ /dev/null
@@ -1,197 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <unordered_map>
-
-using namespace arm_compute;
-
-namespace
-{
-// Sentinel stored in place of a row index when a lookup key is not found among the keys
-constexpr size_t NOT_HIT = 0xFFFFFFFF;
-} // namespace
-
-/** Default constructor: every tensor pointer starts as nullptr until configure() is called. */
-NEHashtableLookupKernel::NEHashtableLookupKernel()
-    : _lookups(nullptr), _keys(nullptr), _input(nullptr), _output(nullptr), _hits{nullptr}
-{
-}
-
-/** Bind the tensors, auto-initialize the outputs and set up the execution window.
- *
- * @param[in]  lookups Keys to search for.
- * @param[in]  keys    Keys that identify the rows of @p input.
- * @param[in]  input   Values tensor; its last dimension is indexed through @p keys.
- * @param[out] output  Result tensor. When still empty it is initialized with the input shape,
- *                     with the last dimension replaced by the number of lookups.
- * @param[out] hits    Per-lookup hit flags. When still empty it is initialized with the shape
- *                     of @p lookups and data type U8.
- */
-void NEHashtableLookupKernel::configure(const ITensor *lookups, const ITensor *keys,
-                                        const ITensor *input, ITensor *output, ITensor *hits)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits);
-  ARM_COMPUTE_ERROR_THROW_ON(
-      validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info()));
-
-  _lookups = lookups;
-  _keys = keys;
-  _input = input;
-  _output = output;
-  _hits = hits;
-
-  // Auto initialize output if not initialized.
-  // The last output dimension holds one row per lookup, so it takes the number of lookup
-  // entries (dimension 0 of lookups), not the rank of the lookups tensor. This matches the
-  // output-shape check performed by validate().
-  auto out_shape{input->info()->tensor_shape()};
-  out_shape.set(out_shape.num_dimensions() - 1, lookups->info()->dimension(0), false);
-  auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type(),
-                     input->info()->quantization_info());
-
-  // Auto initialize hits if not initialized
-  auto_init_if_empty(*hits->info(), lookups->info()->tensor_shape(), 1, DataType::U8);
-
-  INEKernel::configure(calculate_max_window(*output->info()));
-}
-
-/** Check the tensor infos of a hashtable lookup configuration.
- *
- * NOTE(review): the checks use the ARM_COMPUTE_ERROR_ON* family, and the only return statement
- * yields an empty Status, so a failed check is raised by those macros and not reported through
- * the return value. Confirm whether the ARM_COMPUTE_RETURN_ERROR_ON* variants are wanted here.
- *
- * @param[in] lookups Lookups tensor info; S32 with at most 1 dimension.
- * @param[in] keys    Keys tensor info; S32 with at most 1 dimension, with as many entries as
- *                    the last dimension of @p input.
- * @param[in] input   Values tensor info; between 2 and 4 dimensions.
- * @param[in] output  Output tensor info; only checked when its total size is non-zero.
- * @param[in] hits    Hits tensor info; only checked when its total size is non-zero.
- *
- * @return An empty Status.
- */
-Status NEHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITensorInfo *keys,
-                                         const ITensorInfo *input, const ITensorInfo *output,
-                                         const ITensorInfo *hits)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
-      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
-      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32);
-
-  // The input rank must lie in [2, 4]. A rank cannot be below 2 and above 4 at the same time,
-  // so the two bounds have to be combined with || for the check to ever trigger.
-  ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 || input->num_dimensions() > 4);
-  ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1);
-  ARM_COMPUTE_ERROR_ON(keys->num_dimensions() > 1);
-  ARM_COMPUTE_ERROR_ON(keys->dimension(0) != input->dimension(input->num_dimensions() - 1));
-
-  // Validate in case of configured output
-  if (output->total_size() > 0)
-  {
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON(input->num_dimensions() != output->num_dimensions());
-    ARM_COMPUTE_ERROR_ON(output->dimension(output->num_dimensions() - 1) != lookups->dimension(0));
-    for (size_t i = 0; i < output->num_dimensions() - 1; ++i)
-    {
-      ARM_COMPUTE_ERROR_ON(input->dimension(i) != output->dimension(i));
-    }
-  }
-
-  // Validate in case of configured hits
-  if (hits->total_size() > 0)
-  {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8);
-    ARM_COMPUTE_ERROR_ON(hits->dimension(0) != output->dimension(output->num_dimensions() - 1));
-    ARM_COMPUTE_ERROR_ON(hits->dimension(0) != lookups->dimension(0));
-    ARM_COMPUTE_ERROR_ON(hits->num_dimensions() > 1);
-  }
-
-  return Status{};
-}
-
-/** Resolve every lookup key to a row of the input and fill the output and the hit flags.
- *
- * A key -> position map is built from _keys. Each entry of _lookups is then resolved through
- * it: a miss records NOT_HIT and writes 0 to _hits, a hit records the position and writes 1.
- * The output is finally produced one run of dimension(0) elements at a time, either copied
- * from the resolved input row or filled with the zero value of the output (its quantization
- * offset for QASYMM8, otherwise 0).
- *
- * @param[in] window Region to process; must be a valid sub-window of the kernel window.
- * @param[in] info   Thread information (unused).
- */
-void NEHashtableLookupKernel::run(const Window &window, const ThreadInfo &info)
-{
-  ARM_COMPUTE_UNUSED(info);
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
-  const size_t lookup_dim = _output->info()->num_dimensions() - 1;
-  // Byte value written for rows whose key was not found
-  const int fill_value = _output->info()->data_type() == DataType::QASYMM8
-                             ? _output->info()->quantization_info().uniform().offset
-                             : 0;
-  // Bytes written per loop iteration
-  const size_t row_bytes = _output->info()->dimension(0) * _output->info()->element_size();
-
-  // Map each key to its position; a repeated key keeps the last position seen
-  std::unordered_map<int32_t, size_t> key_to_row;
-  for (size_t pos = 0; pos < _keys->info()->dimension(0); ++pos)
-  {
-    const int32_t key = *reinterpret_cast<int32_t *>(_keys->ptr_to_element({pos}));
-    key_to_row[key] = pos;
-  }
-
-  // Resolve every lookup once, recording the hit flag on the way
-  std::vector<size_t> lookup_indices;
-  for (size_t slot = 0; slot < _lookups->info()->dimension(0); ++slot)
-  {
-    const int32_t wanted = *reinterpret_cast<int32_t *>(_lookups->ptr_to_element({slot}));
-    const auto match = key_to_row.find(wanted);
-    if (match == key_to_row.end())
-    {
-      lookup_indices.emplace_back(NOT_HIT);
-      *_hits->ptr_to_element({slot}) = 0;
-      continue;
-    }
-
-#if defined(ARM_COMPUTE_DEBUG_ENABLED)
-    if (match->second >= _keys->info()->dimension(0))
-      ARM_COMPUTE_ERROR("HashTable Lookup: Index out of bounds.");
-#endif // defined(ARM_COMPUTE_DEBUG_ENABLED)
-    lookup_indices.emplace_back(match->second);
-    *_hits->ptr_to_element({slot}) = 1;
-  }
-
-  // Step along X by the length of an input row so that each iteration handles one row
-  Window row_window{window};
-  row_window.set(Window::DimX, Window::Dimension(row_window.x().start(), row_window.x().end(),
-                                                 _input->info()->dimension(0)));
-
-  Window slice = row_window.first_slice_window_4D();
-  do
-  {
-    Iterator out_it(_output, slice);
-
-    execute_window_loop(slice,
-                        [&](const Coordinates &id) {
-                          const auto src_row = lookup_indices.at(id[lookup_dim]);
-                          if (src_row != NOT_HIT)
-                          {
-                            Coordinates input_id{id};
-                            input_id.set(lookup_dim, src_row);
-                            memcpy(out_it.ptr(), _input->ptr_to_element(input_id), row_bytes);
-                            return;
-                          }
-
-                          // Key not found: blank the row
-                          memset(out_it.ptr(), fill_value, row_bytes);
-                        },
-                        out_it);
-
-  } while (window.slide_window_slice_4D(slice));
-}
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp
deleted file mode 100644
index 49adf1462..000000000
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp
+++ /dev/null
@@ -1,296 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h"
-
-#include "arm_compute/core/CPP/Validate.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEMath.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace
-{
-/** Instance normalization over the planes of a tensor.
- *
- * For every (id[2], id[3]) pair visited by the window, the rows of the plane are traversed twice:
- * a first pass accumulates the sum and the sum of squares of the elements, from which mean and
- * variance are derived, and a second pass writes
- * (x - mean) * gamma / sqrt(variance + epsilon) + beta to the output.
- *
- * @tparam T Element type; 128-bit NEON vectors of T are used for the bulk of each row.
- *
- * @param[in]  input   Source tensor.
- * @param[out] output  Destination tensor.
- * @param[in]  gamma   Optional scale, indexed by the channel coordinate; 1 is used when nullptr.
- * @param[in]  beta    Optional offset, indexed by the channel coordinate; 0 is used when nullptr.
- * @param[in]  epsilon Value added to the variance before taking the square root.
- * @param[in]  window  Region to process.
- */
-template <typename T>
-void instance_normalization_nchw(ITensor *input, ITensor *output, ITensor *gamma, ITensor *beta,
-                                 float epsilon, const Window &window)
-{
-  /** NEON vector tag type. */
-  using ExactTagType =
-      typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
-
-  // Clear X/Y dimensions on execution window as we handle the planes manually
-  Window win = window;
-  win.set(Window::DimX, Window::Dimension(0, 1, 1));
-  win.set(Window::DimY, Window::Dimension(0, 1, 1));
-
-  // Number of T elements that fit in one 16-byte vector
-  constexpr int window_step_x = 16 / sizeof(T);
-  // Divisor used for mean and variance.
-  // NOTE(review): dimension 0 is read from the input and dimension 1 from the output; this
-  // presumably relies on both tensors having the same shape - confirm.
-  const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1);
-  const auto channel_idx =
-      get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
-
-  Iterator input_it(input, win);
-  execute_window_loop(
-      win,
-      [&](const Coordinates &id) {
-        // Window restricted to the current plane: Z and dimension 3 are pinned to id, Y is kept
-        // from the incoming window and X is walked manually inside the inner loops
-        Window win_plane = window;
-        win_plane.set(Window::DimX, Window::Dimension(0, 1, 1));
-        win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1));
-        win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1));
-
-        Iterator input_plane_it(input, win_plane);
-        Iterator output_plane_it(output, win_plane);
-
-        // Plane-wide accumulators, kept in the element type T
-        auto sum_h_w = static_cast<T>(0.f);
-        auto sum_squares_h_w = static_cast<T>(0.f);
-
-        // First pass: accumulate sum and sum of squares, one row per iteration
-        execute_window_loop(
-            win_plane,
-            [&](const Coordinates &) {
-              const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr());
-
-              auto vec_sum_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
-              auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
-
-              // Compute S elements per iteration
-              int x = window.x().start();
-              for (; x <= (window.x().end() - window_step_x); x += window_step_x)
-              {
-                auto vec_input_val = wrapper::vloadq(input_ptr + x);
-                vec_sum_h_w = wrapper::vadd(vec_sum_h_w, vec_input_val);
-                vec_sum_squares_h_w =
-                    wrapper::vadd(vec_sum_squares_h_w, wrapper::vmul(vec_input_val, vec_input_val));
-              }
-
-              // Horizontal reduction: pairwise-add the high and low halves, then keep
-              // pairwise-adding so that lane 0 ends up holding the sum of all lanes
-              auto vec2_sum_h_w =
-                  wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w));
-              auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w),
-                                                         wrapper::vgetlow(vec_sum_squares_h_w));
-              for (int i = 0; i < window_step_x / 4; ++i)
-              {
-                vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w);
-                vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w);
-              }
-              sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0);
-              sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0);
-
-              // Compute left-over elements
-              for (; x < window.x().end(); ++x)
-              {
-                const auto value = *(input_ptr + x);
-                sum_h_w += value;
-                sum_squares_h_w += value * value;
-              }
-            },
-            input_plane_it, output_plane_it);
-
-        // Variance computed as E[x^2] - E[x]^2
-        const auto mean_h_w = sum_h_w / elements_plane;
-        const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w;
-
-        // gamma and beta are optional; without them the scale is 1 and the offset is 0
-        auto gamma_val = 1.0f;
-        if (gamma != nullptr)
-        {
-          gamma_val = *reinterpret_cast<T *>(gamma->ptr_to_element({id[channel_idx]}));
-        }
-        const auto multip_h_w = gamma_val / std::sqrt(var_h_w + epsilon);
-        const auto vec_mean_h_w = wrapper::vdup_n(static_cast<T>(mean_h_w), ExactTagType{});
-        const auto vec_multip_h_w = wrapper::vdup_n(static_cast<T>(multip_h_w), ExactTagType{});
-        auto beta_val = 0.0f;
-        if (beta != nullptr)
-        {
-          beta_val = *reinterpret_cast<T *>(beta->ptr_to_element({id[channel_idx]}));
-        }
-        const auto vec_beta = wrapper::vdup_n(static_cast<T>(beta_val), ExactTagType{});
-
-        // Second pass: normalize every row of the plane and store it to the output
-        execute_window_loop(
-            win_plane,
-            [&](const Coordinates &) {
-              auto input_ptr = reinterpret_cast<T *>(input_plane_it.ptr());
-              auto output_ptr = reinterpret_cast<T *>(output_plane_it.ptr());
-
-              // Compute S elements per iteration
-              int x = window.x().start();
-              auto vec_val = wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{});
-              for (; x <= (window.x().end() - window_step_x); x += window_step_x)
-              {
-                vec_val = wrapper::vloadq(input_ptr + x);
-                vec_val = wrapper::vadd(
-                    wrapper::vmul(wrapper::vsub(vec_val, vec_mean_h_w), vec_multip_h_w), vec_beta);
-                wrapper::vstore(output_ptr + x, vec_val);
-              }
-
-              // Compute left-over elements
-              for (; x < window.x().end(); ++x)
-              {
-                *(output_ptr + x) = ((*(input_ptr + x)) - mean_h_w) * multip_h_w + beta_val;
-              }
-            },
-            input_plane_it, output_plane_it);
-      },
-      input_it);
-}
-
-/** Check the static configuration of an instance normalization.
- *
- * @param[in] input   Source tensor info; F16 or F32, and not in NHWC layout.
- * @param[in] output  Destination tensor info; only compared with @p input when it is non-null
- *                    and its total size is non-zero.
- * @param[in] gamma   Optional scale tensor info; when given it must match the input data type
- *                    and have one entry per input channel.
- * @param[in] beta    Optional offset tensor info; same constraints as @p gamma.
- * @param[in] epsilon Must be different from 0.
- *
- * @return An empty Status on success, an error Status otherwise.
- */
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
-                          const ITensorInfo *gamma, const ITensorInfo *beta, float epsilon)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(epsilon == 0.f, "Epsilon must be different than 0");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC,
-                                  "NHWC data layout is not supported by the kernel directly");
-
-  // An output that is already configured has to mirror the input
-  const bool output_configured = (output != nullptr) && (output->total_size() != 0);
-  if (output_configured)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(),
-                                    "Input and output have different number of channels");
-  }
-
-  // The scale, when present, provides one value per input channel
-  if (gamma != nullptr)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma);
-    const auto channel_dim =
-        get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(channel_dim) != gamma->dimension(0),
-                                    "Gamma's size must be the same as size of input's channel");
-  }
-
-  // The offset, when present, provides one value per input channel
-  if (beta != nullptr)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta);
-    const auto channel_dim =
-        get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(channel_dim) != beta->dimension(0),
-                                    "Beta's size must be the same as size of input's channel");
-  }
-
-  return Status{};
-}
-
-/** Compute the execution window and finish the output metadata.
- *
- * The window is the maximum window of @p input with a step of 1. The output is auto-initialized
- * from the input shape and data type when still empty, and its valid region is set to cover
- * its whole shape.
- *
- * @param[in]      input  Source tensor info.
- * @param[in, out] output Destination tensor info.
- *
- * @return An empty Status together with the computed window.
- */
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
-  // We handle the planes manually
-  Window max_window = calculate_max_window(*input, Steps(1));
-
-  // Output auto initialization if not yet initialized
-  auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type());
-
-  // NEInstanceNormalizationLayerKernelEx doesn't need padding so update_window_and_padding() can be
-  // skipped
-  Coordinates origin;
-  origin.set_num_dimensions(output->num_dimensions());
-  const ValidRegion whole_output(origin, output->tensor_shape());
-  output->set_valid_region(whole_output);
-
-  return std::make_pair(Status{}, max_window);
-}
-} // namespace
-
-/** Default constructor: no function and no tensors are bound, and epsilon starts at 1e-12
- * until configure() replaces it.
- */
-NEInstanceNormalizationLayerKernelEx::NEInstanceNormalizationLayerKernelEx()
-    : _func(nullptr), _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr),
-      _epsilon(1e-12)
-{
-}
-
-/** Bind the tensors, select the implementation for the input data type and set up the window.
- *
- * @param[in, out] input   Source tensor; also used as destination when @p output is nullptr.
- * @param[out]     output  Destination tensor, or nullptr to run in place.
- * @param[in]      gamma   Optional scale tensor; may be nullptr.
- * @param[in]      beta    Optional offset tensor; may be nullptr.
- * @param[in]      epsilon Value added to the variance; must be different from 0.
- */
-void NEInstanceNormalizationLayerKernelEx::configure(ITensor *input, ITensor *output,
-                                                     ITensor *gamma, ITensor *beta, float epsilon)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input);
-
-  _input = input;
-  _output = output == nullptr ? input : output;
-  _gamma = gamma;
-  _beta = beta;
-  _epsilon = epsilon;
-
-  // gamma and beta are optional (validate_arguments() and the kernel body both accept nullptr),
-  // so their info must only be queried when the tensor is actually provided
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(_input->info(), _output->info(),
-                                                (gamma != nullptr) ? gamma->info() : nullptr,
-                                                (beta != nullptr) ? beta->info() : nullptr,
-                                                epsilon));
-
-  if (_input->info()->data_type() == DataType::F32)
-  {
-    _func = &instance_normalization_nchw<float>;
-  }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-  else if (_input->info()->data_type() == DataType::F16)
-  {
-    _func = &instance_normalization_nchw<float16_t>;
-  }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-  else
-  {
-    ARM_COMPUTE_ERROR("Unsupported data type");
-  }
-
-  // Configure kernel window
-  auto win_config = validate_and_configure_window(_input->info(), _output->info());
-  ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-
-  INEKernel::configure(std::get<1>(win_config));
-}
-
-/** Check whether the given tensor infos describe a valid instance normalization.
- *
- * The arguments are validated first; the window configuration is then exercised on clones of
- * the infos so that the caller's objects are left untouched. When @p output is nullptr a second
- * clone of @p input stands in for it.
- *
- * @param[in] input   Source tensor info.
- * @param[in] output  Destination tensor info; may be nullptr.
- * @param[in] gamma   Optional scale tensor info.
- * @param[in] beta    Optional offset tensor info.
- * @param[in] epsilon Value added to the variance.
- *
- * @return An empty Status on success, an error Status otherwise.
- */
-Status NEInstanceNormalizationLayerKernelEx::validate(const ITensorInfo *input,
-                                                      const ITensorInfo *output,
-                                                      const ITensorInfo *gamma,
-                                                      const ITensorInfo *beta, float epsilon)
-{
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, gamma, beta, epsilon));
-
-  // Scratch copies that the window configuration is allowed to modify
-  const auto input_copy = input->clone();
-  const auto output_copy = (output == nullptr) ? input->clone() : output->clone();
-  ARM_COMPUTE_RETURN_ON_ERROR(
-      std::get<0>(validate_and_configure_window(input_copy.get(), output_copy.get())));
-
-  return Status{};
-}
-
-/** Run the implementation selected by configure() on the given window.
- *
- * @param[in] window Region to process; must be a valid sub-window of the kernel window.
- * @param[in] info   Thread information (unused).
- */
-void NEInstanceNormalizationLayerKernelEx::run(const Window &window, const ThreadInfo &info)
-{
-  ARM_COMPUTE_UNUSED(info);
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-  // _func was bound in configure() to the instantiation matching the input data type
-  (*_func)(_input, _output, _gamma, _beta, _epsilon, window);
-}
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp
deleted file mode 100644
index b92130cec..000000000
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp
+++ /dev/null
@@ -1,223 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include "arm_compute/core/CPP/Validate.h"
-
-#include <arm_neon.h>
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *scale_factor,
-                          const ITensorInfo *output)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
-  ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output);
-  ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scale_factor, 1, DataType::F16,
-                                                       DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->tensor_shape().total_size() == 0);
-  ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->num_dimensions() > 1);
-  ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->dimension(0) != input->dimension(1));
-
-  // Checks performed when output is configured
-  if ((output->total_size() != 0))
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-  }
-
-  return Status{};
-}
-
-inline int32x4x4_t load_value(const int32_t *input_ptr)
-{
-  return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4),
-          wrapper::vloadq(input_ptr + 8), wrapper::vloadq(input_ptr + 12)};
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-inline const float32x4x4_t load_value(const float16_t *input_ptr)
-{
-  return {vcvt_f32_f16(wrapper::vload(input_ptr)), vcvt_f32_f16(wrapper::vload(input_ptr + 4)),
-          vcvt_f32_f16(wrapper::vload(input_ptr + 8)),
-          vcvt_f32_f16(wrapper::vload(input_ptr + 12))};
-}
-
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-template <typename T> inline void store_result(T *ptr, const float32x4x4_t &v)
-{
-  ARM_COMPUTE_UNUSED(ptr, v);
-}
-
-template <> inline void store_result<float>(float *ptr, const float32x4x4_t &v)
-{
-  wrapper::vstore(ptr, v.val[0]);
-  wrapper::vstore(ptr + 4, v.val[1]);
-  wrapper::vstore(ptr + 8, v.val[2]);
-  wrapper::vstore(ptr + 12, v.val[3]);
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <> inline void store_result<float16_t>(float16_t *ptr, const float32x4x4_t &v)
-{
-  wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1])));
-  wrapper::vstore(ptr + 8, vcombine_f16(vcvt_f16_f32(v.val[2]), vcvt_f16_f32(v.val[3])));
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-inline float32x4x4_t multiply_scale_vec(const int32x4x4_t &iv, float scale)
-{
-  const float32x4_t vscale = vdupq_n_f32(scale);
-
-  const float32x4x4_t ret = {{
-      vmulq_f32(vcvtq_f32_s32(iv.val[0]), vscale), vmulq_f32(vcvtq_f32_s32(iv.val[1]), vscale),
-      vmulq_f32(vcvtq_f32_s32(iv.val[2]), vscale), vmulq_f32(vcvtq_f32_s32(iv.val[3]), vscale),
-  }};
-  return ret;
-}
-} // namespace
-
-NEMultiplyScaleFactorKernel::NEMultiplyScaleFactorKernel()
-    : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f)
-{
-}
-
-void NEMultiplyScaleFactorKernel::configure(const ITensor *input, const ITensor *scale_factor,
-                                            ITensor *output, float multiplier)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_ERROR_THROW_ON(
-      validate_arguments(input->info(), scale_factor->info(), output->info()));
-
-  _input = input;
-  _scale_factor = scale_factor;
-  _output = output;
-  _multiplier = multiplier;
-
-  // Configure kernel window
-  Window win_config = calculate_max_window(*input->info(), Steps());
-
-  Coordinates coord;
-  coord.set_num_dimensions(output->info()->num_dimensions());
-  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-
-  INEKernel::configure(win_config);
-}
-
-Status NEMultiplyScaleFactorKernel::validate(const ITensorInfo *input,
-                                             const ITensorInfo *scale_factor,
-                                             const ITensorInfo *output, float multiplier)
-{
-  ARM_COMPUTE_UNUSED(multiplier);
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, scale_factor, output));
-
-  return Status{};
-}
-
-template <typename T> void NEMultiplyScaleFactorKernel::multiply(const Window &window)
-{
-  constexpr auto window_step = 16;
-  const auto window_start_x = static_cast<int>(window.x().start());
-  const auto window_end_x = static_cast<int>(window.x().end());
-
-  // Collapse window and reset first dimension to handle tail calculations manually
-  // Support Only 2D input
-  Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
-  Iterator input(_input, win_collapsed);
-  Iterator output(_output, win_collapsed);
-  win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-  execute_window_loop(
-      win_collapsed,
-      [&](const Coordinates &id) {
-        auto scale = *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()}));
-        scale *= _multiplier;
-
-        const auto input_ptr = reinterpret_cast<const int32_t *>(input.ptr());
-        auto output_ptr = reinterpret_cast<T *>(output.ptr());
-        int x = window_start_x;
-        for (; x <= (window_end_x - window_step); x += window_step)
-        {
-          store_result<float>(&output_ptr[x], multiply_scale_vec(load_value(&input_ptr[x]), scale));
-        }
-        // Compute left-over elements
-        for (; x < window_end_x; ++x)
-        {
-          output_ptr[x] = input_ptr[x] * scale;
-        }
-      },
-      input, output);
-}
-
-void NEMultiplyScaleFactorKernel::run(const Window &window, const ThreadInfo &info)
-{
-  ARM_COMPUTE_UNUSED(info);
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
-  switch (_output->info()->data_type())
-  {
-    case DataType::F32:
-      NEMultiplyScaleFactorKernel::multiply<float>(window);
-      break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    case DataType::F16:
-      NEMultiplyScaleFactorKernel::multiply<float16_t>(window);
-      break;
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    default:
-      ARM_COMPUTE_ERROR("Unsupported data type.");
-  }
-}
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp
deleted file mode 100644
index 0a11eb509..000000000
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp
+++ /dev/null
@@ -1,229 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/NEOneHotKernel.h"
-#include "arm_compute/core/CPP/Validate.h"
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
-namespace arm_compute
-{
-namespace
-{
-/** Validate the depth
- *
- * Validate that depth are not negative
- *
- * @param[in] depth Depth tensor.
- * @param[in] output Output tensor.
- * @param[in] axis Axis of depth.
- */
-template <typename U> void validate_depth(const ITensor *depth, const ITensor *output, int axis)
-{
-  ARM_COMPUTE_ERROR_ON(*(reinterpret_cast<U *>(depth->buffer())) < 0);
-  ARM_COMPUTE_ERROR_ON(static_cast<U>(output->info()->tensor_shape()[axis]) !=
-                       *(reinterpret_cast<U *>(depth->buffer())));
-}
-
-Status validate_arguments(const ITensorInfo *indices, const ITensorInfo *depth,
-                          const ITensorInfo *on_value, const ITensorInfo *off_value,
-                          const ITensorInfo *output, int axis)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(indices, depth, on_value, off_value, output);
-  const int actual_axis = wrap_around(axis, static_cast<int>(output->num_dimensions()));
-  ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
-  ARM_COMPUTE_RETURN_ERROR_ON(on_value->tensor_shape().total_size() != 1);
-  ARM_COMPUTE_RETURN_ERROR_ON(0 > actual_axis ||
-                              actual_axis >= static_cast<int>(output->num_dimensions()));
-  ARM_COMPUTE_RETURN_ERROR_ON(on_value->data_type() == DataType::UNKNOWN);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(on_value, 1, DataType::U8, DataType::S8,
-                                                       DataType::U16, DataType::S16, DataType::F16,
-                                                       DataType::U32, DataType::S32, DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, off_value);
-  if (output->total_size() != 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, output);
-  }
-
-  return Status{};
-}
-
-template <typename U, typename Enable = void> bool isOnValue(U) { return true; }
-
-template <typename U, std::enable_if_t<std::is_integral<U>::value, int> = 0>
-bool isOnValue(U index, U depth)
-{
-  return index >= 0 && index < depth;
-}
-} // namespace
-
-NEOneHotKernel::NEOneHotKernel()
-    : _indices{nullptr}, _depth{nullptr}, _on_value{nullptr}, _off_value{nullptr}, _axis{-1},
-      _output{nullptr}, _func{}
-{
-}
-
-template <typename U>
-void NEOneHotKernel::onehot_0_axis(const Window &window, const ThreadInfo &info)
-{
-  ARM_COMPUTE_UNUSED(info);
-  // Validate that the depth are not negative
-  validate_depth<U>(_depth, _output, _axis);
-  Window output_window{window};
-  output_window.set(Window::DimX, Window::Dimension(0, 1, 1));
-  Iterator output_it(_output, output_window);
-  const U off_value = *reinterpret_cast<U *>(_off_value->buffer());
-  execute_window_loop(
-      output_window,
-      [&](const Coordinates &id) {
-        std::fill_n(output_it.ptr(),
-                    _output->info()->dimension(0) * _output->info()->element_size(), off_value);
-        Coordinates indices_id(id);
-        indices_id.remove(0);
-        const U new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(indices_id)));
-        if (isOnValue(new_index, *(reinterpret_cast<U *>(_depth->buffer()))))
-        {
-          Coordinates onehot_id(id);
-          onehot_id.set(0, new_index);
-          std::copy_n(_on_value->buffer(), _output->info()->element_size(),
-                      _output->ptr_to_element(onehot_id));
-        }
-      },
-      output_it);
-}
-
-template <typename U>
-inline void NEOneHotKernel::onehot_n_axis(const Window &window, const ThreadInfo &info)
-{
-  ARM_COMPUTE_UNUSED(info);
-  // Validate that the indices are not negative
-  validate_depth<U>(_depth, _output, _axis);
-  Iterator output_it(_output, window);
-  execute_window_loop(window,
-                      [&](const Coordinates &id) {
-                        Coordinates indices_id(id);
-                        indices_id.remove(_axis);
-                        const U new_index =
-                            *(reinterpret_cast<U *>(_indices->ptr_to_element(indices_id)));
-                        if (isOnValue(new_index, *(reinterpret_cast<U *>(_depth->buffer()))))
-                        {
-                          Coordinates onehot_id(id);
-                          onehot_id.set(_axis, new_index);
-                          std::copy_n(static_cast<U>(id[_axis]) == new_index ? _on_value->buffer()
-                                                                             : _off_value->buffer(),
-                                      _output->info()->element_size(), output_it.ptr());
-                        }
-                      },
-                      output_it);
-}
-
-void NEOneHotKernel::configure(const ITensor *indices, const ITensor *depth,
-                               const ITensor *on_value, const ITensor *off_value, ITensor *output,
-                               int axis)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(indices, depth, on_value, off_value, output);
-  ARM_COMPUTE_ERROR_ON(output->info()->total_size() == 0);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(indices->info(), depth->info(), on_value->info(),
-                                                off_value->info(), output->info(), axis));
-  _indices = indices;
-  _depth = depth;
-  _on_value = on_value;
-  _off_value = off_value;
-  _output = output;
-  _axis = wrap_around(axis, static_cast<int>(output->info()->num_dimensions()));
-  if (0 == _axis)
-  {
-    switch (_indices->info()->data_type())
-    {
-      case DataType::U32:
-        _func = &NEOneHotKernel::onehot_0_axis<uint32_t>;
-        break;
-      case DataType::S32:
-        _func = &NEOneHotKernel::onehot_0_axis<int32_t>;
-        break;
-      default:
-        ARM_COMPUTE_ERROR("Not supported");
-        break;
-    }
-  }
-  else
-  {
-    switch (_indices->info()->data_type())
-    {
-      case DataType::U32:
-        _func = &NEOneHotKernel::onehot_n_axis<uint32_t>;
-        break;
-      case DataType::S32:
-        _func = &NEOneHotKernel::onehot_n_axis<int32_t>;
-        break;
-      default:
-        ARM_COMPUTE_ERROR("Not supported");
-        break;
-    }
-  }
-  // Create window
-  Window win = calculate_max_window(*output->info(), Steps());
-  output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-  INEKernel::configure(win);
-}
-
-Status NEOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *depth,
-                                const ITensorInfo *on_value, const ITensorInfo *off_value,
-                                const ITensorInfo *output, int axis)
-{
-  ARM_COMPUTE_RETURN_ON_ERROR(
-      validate_arguments(indices, depth, on_value, off_value, output, axis));
-  return Status{};
-}
-
-void NEOneHotKernel::run(const Window &window, const ThreadInfo &info)
-{
-  ARM_COMPUTE_UNUSED(info);
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON(_func == nullptr);
-  (this->*_func)(window, info);
-}
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp
deleted file mode 100644
index 5841f1d69..000000000
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp
+++ /dev/null
@@ -1,240 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include "arm_compute/core/CPP/Validate.h"
-
-#include <arm_neon.h>
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
-                          const ITensorInfo *scale_factor)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
-  ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8_SIGNED);
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scale_factor, 1, DataType::F16,
-                                                       DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->tensor_shape().total_size() == 0);
-  ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->num_dimensions() > 1);
-  ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->dimension(0) != input->dimension(1));
-
-  return Status{};
-}
-
-inline float32x4x4_t load_value(const float *input_ptr)
-{
-  return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4),
-          wrapper::vloadq(input_ptr + 8), wrapper::vloadq(input_ptr + 12)};
-}
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-inline const float32x4x4_t load_value(const float16_t *input_ptr)
-{
-  return {vcvt_f32_f16(wrapper::vload(input_ptr)), vcvt_f32_f16(wrapper::vload(input_ptr + 4)),
-          vcvt_f32_f16(wrapper::vload(input_ptr + 8)),
-          vcvt_f32_f16(wrapper::vload(input_ptr + 12))};
-}
-
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-inline float32x4_t round(const float32x4_t &fv)
-{
-  const float32x4_t point5_f32x4 = vdupq_n_f32(0.5f);
-  const float32x4_t zero_f32x4 = vdupq_n_f32(0.0f);
-  // If value < 0, mask = -1, else mask = 0
-  int32x4_t mask_less_zero_ui32x4 = reinterpret_cast<int32x4_t>(vcltq_f32(fv, zero_f32x4));
-  return vaddq_f32(fv, vaddq_f32(vcvtq_f32_s32(mask_less_zero_ui32x4), point5_f32x4));
-}
-
-inline int8x16_t vquantizeSymm(const float32x4x4_t &fv, float scale_factor_inv, int32_t max_scale)
-{
-  const float32x4_t vinvscale = vdupq_n_f32(scale_factor_inv);
-  const int32x4_t vposend = vdupq_n_s32(max_scale);
-  const int32x4_t vnagend = vdupq_n_s32(-max_scale);
-
-  const int32x4x4_t rf = {{
-#ifdef __aarch64__
-      vminq_s32(vposend,
-                vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))),
-      vminq_s32(vposend,
-                vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))),
-      vminq_s32(vposend,
-                vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))),
-      vminq_s32(vposend,
-                vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))),
-#else  //__aarch64__
-      vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))),
-      vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))),
-      vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))),
-      vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))),
-#endif //__aarch64__
-  }};
-  const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
-  const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
-  return vcombine_s8(pa, pb);
-}
-} // namespace
-
-NEQuantizationSymmetricKernel::NEQuantizationSymmetricKernel()
-    : _input(nullptr), _output(nullptr), _scale_factor(nullptr)
-{
-}
-
-void NEQuantizationSymmetricKernel::configure(const ITensor *input, ITensor *output,
-                                              ITensor *scale_factor)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_ERROR_THROW_ON(
-      validate_arguments(input->info(), output->info(), scale_factor->info()));
-
-  _input = input;
-  _output = output;
-  _scale_factor = scale_factor;
-
-  // Configure kernel window
-  Window win_config = calculate_max_window(*input->info(), Steps());
-
-  Coordinates coord;
-  coord.set_num_dimensions(output->info()->num_dimensions());
-  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-
-  INEKernel::configure(win_config);
-}
-
-Status NEQuantizationSymmetricKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                               const ITensorInfo *scale_factor)
-{
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, scale_factor));
-
-  return Status{};
-}
-
-template <typename T> void NEQuantizationSymmetricKernel::quantize(const Window &window)
-{
-  constexpr auto window_step = 16;
-  const auto window_start_x = static_cast<int>(window.x().start());
-  const auto window_end_x = static_cast<int>(window.x().end());
-
-#ifdef __aarch64__
-  constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
-#else  //__aarch64__
-  constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP;
-#endif //__aarch64__
-
-  // Collapse window and reset first dimension to handle tail calculations manually
-  // Support Only 2D input
-  Window win_collapsed = window;
-  Iterator input(_input, win_collapsed);
-  Iterator output(_output, win_collapsed);
-  const auto dim_x = _input->info()->dimension(0);
-  win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-  execute_window_loop(
-      win_collapsed,
-      [&](const Coordinates &id) {
-        const auto start = reinterpret_cast<const T *>(input.ptr());
-        const auto min_max = std::minmax_element(start, start + dim_x);
-        const auto int8_scale = 127;
-        auto range = std::max(std::abs(*min_max.first), std::abs(*min_max.second));
-        if (range == 0)
-        {
-          *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = 1;
-          range = 1;
-        }
-        else
-        {
-          *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = range / int8_scale;
-        }
-        const auto scale_factor_inv = int8_scale / range;
-
-        auto input_ptr = reinterpret_cast<const T *>(input.ptr());
-        auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-        int x = window_start_x;
-        for (; x <= (window_end_x - window_step); x += window_step)
-        {
-          wrapper::vstore(&output_ptr[x],
-                          vquantizeSymm(load_value(&input_ptr[x]), scale_factor_inv, int8_scale));
-        }
-        // Compute left-over elements
-        for (; x < window_end_x; ++x)
-        {
-          int quantized = arm_compute::round(input_ptr[x] * scale_factor_inv, rounding_policy);
-          quantized = std::min(int8_scale, std::max(quantized, -int8_scale));
-          output_ptr[x] = static_cast<int8_t>(quantized);
-        }
-      },
-      input, output);
-}
-
-void NEQuantizationSymmetricKernel::run(const Window &window, const ThreadInfo &info)
-{
-  ARM_COMPUTE_UNUSED(info);
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
-  switch (_input->info()->data_type())
-  {
-    case DataType::F32:
-      NEQuantizationSymmetricKernel::quantize<float>(window);
-      break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    case DataType::F16:
-      NEQuantizationSymmetricKernel::quantize<float16_t>(window);
-      break;
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    default:
-      ARM_COMPUTE_ERROR("Unsupported data type.");
-  }
-}
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEReductionOperationKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEReductionOperationKernelEx.cpp
deleted file mode 100644
index 3b65eac10..000000000
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEReductionOperationKernelEx.cpp
+++ /dev/null
@@ -1,693 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h"
-
-#include "arm_compute/core/CPP/Validate.h"
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/NEON/NEMath.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace
-{
-// Reduces a float32x4_t to its smallest element using two pairwise-min steps. Every lane of the
-// returned vector holds that minimum.
-float32x2_t calculate_min(float32x4_t in)
-{
-  const auto high_half = wrapper::vgethigh(in);
-  const auto low_half = wrapper::vgetlow(in);
-  const auto pair_min = wrapper::vpmin(high_half, low_half);
-  return wrapper::vpmin(pair_min, pair_min);
-}
-
-// Reduces a float32x4_t to its largest element using two pairwise-max steps. Every lane of the
-// returned vector holds that maximum.
-float32x2_t calculate_max(float32x4_t in)
-{
-  const auto high_half = wrapper::vgethigh(in);
-  const auto low_half = wrapper::vgetlow(in);
-  const auto pair_max = wrapper::vpmax(high_half, low_half);
-  return wrapper::vpmax(pair_max, pair_max);
-}
-// Reduces an int32x4_t to its smallest element using two pairwise-min steps. Every lane of the
-// returned vector holds that minimum.
-int32x2_t calculate_min(int32x4_t in)
-{
-  const auto high_half = wrapper::vgethigh(in);
-  const auto low_half = wrapper::vgetlow(in);
-  const auto pair_min = wrapper::vpmin(high_half, low_half);
-  return wrapper::vpmin(pair_min, pair_min);
-}
-
-// Reduces an int32x4_t to its largest element using two pairwise-max steps. Every lane of the
-// returned vector holds that maximum.
-int32x2_t calculate_max(int32x4_t in)
-{
-  const auto high_half = wrapper::vgethigh(in);
-  const auto low_half = wrapper::vgetlow(in);
-  const auto pair_max = wrapper::vpmax(high_half, low_half);
-  return wrapper::vpmax(pair_max, pair_max);
-}
-
-// Reduces a uint8x16_t to its smallest element. Every lane of the returned vector holds that
-// minimum.
-inline uint8x8_t calculate_min(uint8x16_t in)
-{
-  // First fold: 16 candidates -> 8
-  auto folded = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
-  // Three further folds of the vector with itself: 8 -> 4 -> 2 -> 1
-  for (int pass = 0; pass < 3; ++pass)
-  {
-    folded = wrapper::vpmin(folded, folded);
-  }
-  return folded;
-}
-// Reduces a uint8x16_t to its largest element. Every lane of the returned vector holds that
-// maximum.
-inline uint8x8_t calculate_max(uint8x16_t in)
-{
-  // First fold: 16 candidates -> 8
-  auto folded = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
-  // Three further folds of the vector with itself: 8 -> 4 -> 2 -> 1
-  for (int pass = 0; pass < 3; ++pass)
-  {
-    folded = wrapper::vpmax(folded, folded);
-  }
-  return folded;
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-// Reduces a float16x8_t to its smallest element. Every lane of the returned vector holds that
-// minimum.
-inline float16x4_t calculate_min(float16x8_t in)
-{
-  // First fold: 8 candidates -> 4
-  auto folded = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
-  // Two further folds of the vector with itself: 4 -> 2 -> 1
-  for (int pass = 0; pass < 2; ++pass)
-  {
-    folded = wrapper::vpmin(folded, folded);
-  }
-  return folded;
-}
-// Reduces a float16x8_t to its largest element. Every lane of the returned vector holds that
-// maximum.
-inline float16x4_t calculate_max(float16x8_t in)
-{
-  // First fold: 8 candidates -> 4
-  auto folded = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
-  // Two further folds of the vector with itself: 4 -> 2 -> 1
-  for (int pass = 0; pass < 2; ++pass)
-  {
-    folded = wrapper::vpmax(folded, folded);
-  }
-  return folded;
-}
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-/** Walks a window slice by slice and applies the reduction functor to every slice.
- *
- * One static entry point is provided per reduction axis (X, Y, Z and W).
- *
- * @tparam F Functor type invoked with the input/output iterators and slices.
- */
-template <class F> class Reducer
-{
-public:
-  /** Reduction along X: the functor is called once per 1D slice of @p window. */
-  static void reduceX(const Window &window, const ITensor *input, ITensor *output, F f,
-                      const ReduceOperation op)
-  {
-    // Output window: same as the input one, with X collapsed to (0, 0, 0)
-    Window out_window(window);
-    out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
-
-    Window in_slice = window.first_slice_window_1D();
-    Window out_slice = out_window.first_slice_window_1D();
-
-    bool has_next_slice = true;
-    while (has_next_slice)
-    {
-      {
-        Iterator in(input, in_slice);
-        Iterator out(output, out_slice);
-
-        f(in, out, in_slice, out_slice, *input->info(), op);
-      }
-      // Stop as soon as either window runs out of slices
-      has_next_slice = window.slide_window_slice_1D(in_slice) &&
-                       out_window.slide_window_slice_1D(out_slice);
-    }
-  }
-
-  /** Reduction along Y: the functor is called once per 2D slice, with axis index 1. */
-  static void reduceY(const Window &window, const ITensor *input, ITensor *output, F f,
-                      const ReduceOperation op)
-  {
-    const auto out_extent_y = output->info()->dimension(1);
-
-    // Input window covers a single Y position; output window spans the output Y extent in one step
-    Window in_window(window);
-    in_window.set(Window::DimY, Window::Dimension(0, 1, 1));
-    Window out_window(window);
-    out_window.set(Window::DimY, Window::Dimension(0, out_extent_y, out_extent_y));
-
-    Window in_slice = in_window.first_slice_window_2D();
-    Window out_slice = out_window.first_slice_window_2D();
-
-    bool has_next_slice = true;
-    while (has_next_slice)
-    {
-      {
-        Iterator in(input, in_slice);
-        Iterator out(output, out_slice);
-
-        f(in, out, in_slice, out_slice, *input->info(), 1, op);
-      }
-      has_next_slice = in_window.slide_window_slice_2D(in_slice) &&
-                       out_window.slide_window_slice_2D(out_slice);
-    }
-  }
-
-  /** Reduction along Z: the functor is called once per 3D slice, with axis index 2. */
-  static void reduceZ(const Window &window, const ITensor *input, ITensor *output, F f,
-                      const ReduceOperation op)
-  {
-    const auto out_extent_z = output->info()->dimension(2);
-
-    // Input window covers a single Z position; output window spans the output Z extent in one step
-    Window in_window(window);
-    in_window.set(Window::DimZ, Window::Dimension(0, 1, 1));
-    Window out_window(window);
-    out_window.set(Window::DimZ, Window::Dimension(0, out_extent_z, out_extent_z));
-
-    Window in_slice = in_window.first_slice_window_3D();
-    Window out_slice = out_window.first_slice_window_3D();
-
-    bool has_next_slice = true;
-    while (has_next_slice)
-    {
-      {
-        Iterator in(input, in_slice);
-        Iterator out(output, out_slice);
-
-        f(in, out, in_slice, out_slice, *input->info(), 2, op);
-      }
-      has_next_slice = in_window.slide_window_slice_3D(in_slice) &&
-                       out_window.slide_window_slice_3D(out_slice);
-    }
-  }
-
-  /** Reduction along W: the functor is called once per 4D slice, with axis index 3. */
-  static void reduceW(const Window &window, const ITensor *input, ITensor *output, F f,
-                      const ReduceOperation op)
-  {
-    // Both windows cover a single position along dimension 3
-    Window in_window(window);
-    in_window.set(3, Window::Dimension(0, 1, 1));
-    Window out_window(window);
-    out_window.set(3, Window::Dimension(0, 1, 1));
-
-    Window in_slice = in_window.first_slice_window_4D();
-    Window out_slice = out_window.first_slice_window_4D();
-
-    bool has_next_slice = true;
-    while (has_next_slice)
-    {
-      {
-        Iterator in(input, in_slice);
-        Iterator out(output, out_slice);
-
-        f(in, out, in_slice, out_slice, *input->info(), 3, op);
-      }
-      has_next_slice = in_window.slide_window_slice_4D(in_slice) &&
-                       out_window.slide_window_slice_4D(out_slice);
-    }
-  }
-};
-
-/** Functor that reduces one slice along X to a single MIN or MAX value.
- *
- * @tparam T Element type.
- * @tparam S Vector size passed to wrapper::traits::neon_vector for T.
- */
-template <typename T, int S> struct RedOpX
-{
-  /** NEON vector tag type. */
-  using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
-
-  inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice,
-                         const TensorInfo &in_info, const ReduceOperation op)
-  {
-    ARM_COMPUTE_UNUSED(out_slice);
-    ARM_COMPUTE_UNUSED(in_info);
-
-    const bool is_min = (op == ReduceOperation::MIN);
-    const bool is_max = (op == ReduceOperation::MAX);
-
-    // MIN/MAX are seeded with the first element of the slice, any other operation with zero
-    T seed = static_cast<T>(0.f);
-    if (is_min || is_max)
-    {
-      seed = *reinterpret_cast<T *>(input.ptr());
-    }
-    auto accumulator = wrapper::vdup_n(seed, ExactTagType{});
-
-    // Lane-wise accumulation over the whole slice
-    execute_window_loop(in_slice,
-                        [&](const Coordinates &) {
-                          const auto lanes =
-                              wrapper::vloadq(reinterpret_cast<const T *>(input.ptr()));
-
-                          if (is_min)
-                          {
-                            accumulator = wrapper::vmin(lanes, accumulator);
-                          }
-                          else if (is_max)
-                          {
-                            accumulator = wrapper::vmax(lanes, accumulator);
-                          }
-                          else
-                          {
-                            ARM_COMPUTE_ERROR("Not supported");
-                          }
-                        },
-                        input);
-
-    // Horizontal reduction of the accumulator; lane 0 is written to the output
-    if (is_min)
-    {
-      *(reinterpret_cast<T *>(output.ptr())) = wrapper::vgetlane(calculate_min(accumulator), 0);
-    }
-    else if (is_max)
-    {
-      *(reinterpret_cast<T *>(output.ptr())) = wrapper::vgetlane(calculate_max(accumulator), 0);
-    }
-    else
-    {
-      ARM_COMPUTE_ERROR("Not supported");
-    }
-  }
-};
-
-/** Functor that reduces one slice of 8-bit data along X to a single MIN or MAX value. */
-struct RedOpX_qasymm8
-{
-  inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice,
-                         const TensorInfo &in_info, const ReduceOperation op)
-  {
-    ARM_COMPUTE_UNUSED(out_slice);
-    ARM_COMPUTE_UNUSED(in_info);
-
-    const bool is_min = (op == ReduceOperation::MIN);
-    const bool is_max = (op == ReduceOperation::MAX);
-
-    // MIN/MAX are seeded with the first byte of the slice broadcast to all lanes
-    uint8x16_t accumulator = {0};
-    if (is_min || is_max)
-    {
-      accumulator = wrapper::vdup_n(*input.ptr(), wrapper::traits::vector_128_tag{});
-    }
-
-    // Lane-wise accumulation over the whole slice
-    execute_window_loop(in_slice,
-                        [&](const Coordinates &) {
-                          const auto lanes = wrapper::vloadq(input.ptr());
-                          if (is_min)
-                          {
-                            accumulator = wrapper::vmin(lanes, accumulator);
-                          }
-                          else if (is_max)
-                          {
-                            accumulator = wrapper::vmax(lanes, accumulator);
-                          }
-                          else
-                          {
-                            ARM_COMPUTE_ERROR("Not supported");
-                          }
-                        },
-                        input);
-
-    // Horizontal reduction of the accumulator; lane 0 is written to the output
-    if (is_min)
-    {
-      *(output.ptr()) = static_cast<uint8_t>(wrapper::vgetlane(calculate_min(accumulator), 0));
-    }
-    else if (is_max)
-    {
-      *(output.ptr()) = static_cast<uint8_t>(wrapper::vgetlane(calculate_max(accumulator), 0));
-    }
-    else
-    {
-      ARM_COMPUTE_ERROR("Not supported");
-    }
-  }
-};
-
-/** Functor that computes a lane-wise MIN or MAX along axis 1, 2 or 3.
- *
- * For every window position a vector is loaded for each index along the reduced axis and folded
- * into an accumulator, which is then stored to the output.
- *
- * @tparam T Element type.
- * @tparam S Vector size passed to wrapper::traits::neon_vector for T.
- */
-template <typename T, int S> struct RedOpYZW
-{
-  /** NEON vector tag type. */
-  using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
-  using neon_vector = typename wrapper::traits::neon_vector<T, S>::type;
-
-  inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice,
-                         const TensorInfo &in_info, int axis, const ReduceOperation op)
-  {
-    ARM_COMPUTE_UNUSED(out_slice);
-
-    const bool is_min = (op == ReduceOperation::MIN);
-    const bool is_max = (op == ReduceOperation::MAX);
-
-    execute_window_loop(
-        in_slice,
-        [&](const Coordinates &) {
-          // MIN/MAX start from the vector at the current position, anything else from zero
-          neon_vector accumulator = {0};
-          if (is_min || is_max)
-          {
-            accumulator = wrapper::vloadq(reinterpret_cast<T *>(input.ptr()));
-          }
-          else
-          {
-            accumulator = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
-          }
-
-          const unsigned int axis_extent = in_info.dimension(axis);
-          for (unsigned int idx = 0; idx < axis_extent; ++idx)
-          {
-            // Address of the vector located `idx` steps along the reduced axis
-            T *element_ptr;
-            switch (axis)
-            {
-              case 1:
-                element_ptr = reinterpret_cast<T *>(
-                    input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, idx)));
-                break;
-              case 2:
-                element_ptr = reinterpret_cast<T *>(
-                    input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, idx)));
-                break;
-              case 3:
-                element_ptr = reinterpret_cast<T *>(
-                    input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, 0, idx)));
-                break;
-              default:
-                ARM_COMPUTE_ERROR("Not supported");
-            }
-            const auto lanes = wrapper::vloadq(element_ptr);
-
-            if (is_min)
-            {
-              accumulator = wrapper::vmin(lanes, accumulator);
-            }
-            else if (is_max)
-            {
-              accumulator = wrapper::vmax(lanes, accumulator);
-            }
-            else
-            {
-              ARM_COMPUTE_ERROR("Not supported");
-            }
-          }
-          wrapper::vstore(reinterpret_cast<T *>(output.ptr()), accumulator);
-        },
-        input, output);
-  }
-};
-
-/** Functor that computes a lane-wise MIN or MAX of 8-bit data along axis 1, 2 or 3.
- *
- * For every window position a vector is loaded for each index along the reduced axis and folded
- * into an accumulator, which is then stored to the output.
- */
-struct RedOpYZW_qasymm8
-{
-  inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice,
-                         const TensorInfo &in_info, int axis, const ReduceOperation op)
-  {
-    ARM_COMPUTE_UNUSED(out_slice);
-
-    const bool is_min = (op == ReduceOperation::MIN);
-    const bool is_max = (op == ReduceOperation::MAX);
-
-    execute_window_loop(
-        in_slice,
-        [&](const Coordinates &) {
-          // The accumulator always starts from the vector at the current position
-          auto accumulator = wrapper::vloadq(input.ptr());
-
-          const unsigned int axis_extent = in_info.dimension(axis);
-          for (unsigned int idx = 0; idx < axis_extent; ++idx)
-          {
-            // Address of the vector located `idx` steps along the reduced axis
-            uint8_t *element_ptr;
-            switch (axis)
-            {
-              case 1:
-                element_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, idx));
-                break;
-              case 2:
-                element_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, idx));
-                break;
-              case 3:
-                element_ptr =
-                    input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, 0, idx));
-                break;
-              default:
-                ARM_COMPUTE_ERROR("Not supported");
-            }
-            const auto lanes = wrapper::vloadq(element_ptr);
-
-            if (is_min)
-            {
-              accumulator = wrapper::vmin(lanes, accumulator);
-            }
-            else if (is_max)
-            {
-              accumulator = wrapper::vmax(lanes, accumulator);
-            }
-            else
-            {
-              ARM_COMPUTE_ERROR("Not supported");
-            }
-          }
-          wrapper::vstore(reinterpret_cast<uint8_t *>(output.ptr()), accumulator);
-        },
-        input, output);
-  }
-};
-
-// Runs the reduction along `axis` with OpX for axis 0 and OpYZW for axes 1, 2 and 3.
-template <typename OpX, typename OpYZW>
-void reduce_along_axis(const Window &window, const ITensor *input, ITensor *output,
-                       unsigned int axis, const ReduceOperation op)
-{
-  switch (axis)
-  {
-    case 0:
-      return Reducer<OpX>::reduceX(window, input, output, OpX(), op);
-    case 1:
-      return Reducer<OpYZW>::reduceY(window, input, output, OpYZW(), op);
-    case 2:
-      return Reducer<OpYZW>::reduceZ(window, input, output, OpYZW(), op);
-    case 3:
-      return Reducer<OpYZW>::reduceW(window, input, output, OpYZW(), op);
-    default:
-      ARM_COMPUTE_ERROR("Unsupported reduction axis");
-  }
-}
-
-// Selects the reduction functors matching the input data type and runs them along `axis`.
-// Two-channel (complex) inputs, axes above 3 and other data types are reported as errors.
-void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsigned int axis,
-               const ReduceOperation op)
-{
-  if (input->info()->num_channels() == 2)
-  {
-    ARM_COMPUTE_ERROR("Not supported");
-  }
-
-  // The axis is checked before the data type so that an invalid axis is always reported as such
-  if (axis > 3)
-  {
-    ARM_COMPUTE_ERROR("Unsupported reduction axis");
-  }
-
-  switch (input->info()->data_type())
-  {
-    case DataType::QASYMM8:
-      return reduce_along_axis<RedOpX_qasymm8, RedOpYZW_qasymm8>(window, input, output, axis, op);
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    case DataType::F16:
-      return reduce_along_axis<RedOpX<float16_t, 8>, RedOpYZW<float16_t, 8>>(window, input,
-                                                                             output, axis, op);
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    case DataType::F32:
-      return reduce_along_axis<RedOpX<float, 4>, RedOpYZW<float, 4>>(window, input, output, axis,
-                                                                     op);
-    case DataType::S32:
-      return reduce_along_axis<RedOpX<int32_t, 4>, RedOpYZW<int32_t, 4>>(window, input, output,
-                                                                         axis, op);
-    default:
-      ARM_COMPUTE_ERROR("Not supported");
-  }
-}
-
-// Checks the input/output tensor infos and the reduction axis.
-// Returns an error Status from the first failing check, or an empty Status when all checks pass.
-// The output is only checked when it has already been initialised (non-zero total size).
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis,
-                          ReduceOperation op)
-{
-  ARM_COMPUTE_UNUSED(op);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
-
-  // Only single-channel inputs are accepted
-  if (input->num_channels() != 1)
-  {
-    ARM_COMPUTE_RETURN_ERROR_MSG("Not support complex");
-  }
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32,
-                                                       DataType::F16, DataType::F32);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
-                                  "Reduction axis greater than max number of dimensions");
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
-
-  // Nothing more to check for an output that has not been initialised yet
-  if (output->total_size() == 0)
-  {
-    return Status{};
-  }
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
-  ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != output->num_channels());
-
-  // The output shape must match the input shape reduced along `axis`
-  const TensorShape expected_shape =
-      arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis);
-  const TensorInfo expected_info = input->clone()->set_tensor_shape(expected_shape);
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &expected_info);
-
-  return Status{};
-}
-
-// Auto-initialises the output info if it is empty and computes the execution window.
-// Returns the window together with a Status that carries an "Insufficient Padding!" error when
-// update_window_and_padding reports that the window had to be changed.
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output,
-                                                         unsigned int axis, ReduceOperation op)
-{
-  ARM_COMPUTE_UNUSED(op);
-
-  // Shape of the input reduced along `axis`
-  const TensorShape reduced_shape =
-      arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis);
-
-  // Describe the output with a copy of the input info and use it if the output is still empty
-  auto reduced_info = input->clone();
-  reduced_info->set_tensor_shape(reduced_shape)
-      .set_data_type(input->data_type())
-      .reset_padding()
-      .set_is_resizable(true);
-  auto_init_if_empty(*output, *reduced_info);
-
-  // Number of elements that fit in 16 bytes
-  const unsigned int elems_per_iteration = 16 / data_size_from_type(input->data_type());
-
-  // Configure kernel window
-  Window win = calculate_max_window(*input, Steps(elems_per_iteration));
-  AccessWindowHorizontal input_access(input, 0, elems_per_iteration);
-  AccessWindowHorizontal output_access(output, 0, elems_per_iteration);
-
-  const bool window_changed = update_window_and_padding(win, input_access, output_access);
-  output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-
-  Status err{};
-  if (window_changed)
-  {
-    err = ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!");
-  }
-
-  return std::make_tuple(err, win);
-}
-} // namespace
-
-// Default constructor: no tensors attached, reduction axis 0, operation MAX and a
-// default-constructed border. configure() overwrites all of these members.
-NEReductionOperationKernelEx::NEReductionOperationKernelEx()
-    : _input(nullptr), _output(nullptr), _reduction_axis(0), _op(ReduceOperation::MAX),
-      _border_size()
-{
-}
-
-// Returns the border size stored by configure() (default-constructed before configuration).
-BorderSize NEReductionOperationKernelEx::border_size() const { return _border_size; }
-
-/** Validates the arguments, stores the kernel state and configures the execution window.
- *
- * @param[in]  input  Source tensor.
- * @param[out] output Destination tensor; its info is auto-initialised if empty.
- * @param[in]  axis   Axis along which to reduce.
- * @param[in]  op     Reduction operation to perform.
- */
-void NEReductionOperationKernelEx::configure(const ITensor *input, ITensor *output,
-                                             unsigned int axis, ReduceOperation op)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
-
-  // Number of elements that fit in 16 bytes
-  const unsigned int elems_per_iteration = 16 / data_size_from_type(input->info()->data_type());
-
-  _input = input;
-  _output = output;
-  _op = op;
-  _reduction_axis = axis;
-
-  // Only a reduction along X needs a border; every other axis uses the default one
-  if (axis == 0)
-  {
-    const unsigned int border_elems =
-        elems_per_iteration - (input->info()->dimension(0) % elems_per_iteration);
-    _border_size = BorderSize(0, border_elems, 0, 0);
-  }
-  else
-  {
-    _border_size = BorderSize();
-  }
-
-  // Configure kernel window
-  auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis, op);
-
-  ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-
-  INEKernel::configure(std::get<1>(win_config));
-}
-
-/** Checks whether the given configuration is valid without modifying the caller's tensor infos.
- *
- * @return An error Status from argument validation or window configuration, an empty Status
- *         otherwise.
- */
-Status NEReductionOperationKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                              unsigned int axis, ReduceOperation op)
-{
-  const Status arguments_status = validate_arguments(input, output, axis, op);
-  ARM_COMPUTE_RETURN_ON_ERROR(arguments_status);
-
-  // The window configuration may alter the infos it is given, so it works on clones
-  const auto input_clone = input->clone();
-  const auto output_clone = output->clone();
-  const Status window_status = std::get<0>(
-      validate_and_configure_window(input_clone.get(), output_clone.get(), axis, op));
-  ARM_COMPUTE_RETURN_ON_ERROR(window_status);
-
-  return Status{};
-}
-
-// Executes the configured reduction over `window`.
-// `info` is not used. The kernel must have been configured, and `window` must be a valid
-// sub-window of INEKernel::window(); both are checked through the ARM_COMPUTE_ERROR_ON_* macros.
-void NEReductionOperationKernelEx::run(const Window &window, const ThreadInfo &info)
-{
-  ARM_COMPUTE_UNUSED(info);
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
-  // Dispatch on the stored axis, operation and the input data type
-  reduce_op(window, _input, _output, _reduction_axis, _op);
-}
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/UtilsEx.cpp b/compute/ARMComputeEx/src/core/UtilsEx.cpp
deleted file mode 100644
index 863316909..000000000
--- a/compute/ARMComputeEx/src/core/UtilsEx.cpp
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/UtilsEx.h"
-#include "arm_compute/core/Error.h"
-
-using namespace arm_compute;
-
-/** Computes the output width and height of a transpose convolution.
- *
- * @param[in] in_width       Input width; must be at least 1.
- * @param[in] in_height      Input height; must be at least 1.
- * @param[in] kernel_width   Kernel width; must be greater than the total horizontal padding.
- * @param[in] kernel_height  Kernel height; must be greater than the total vertical padding.
- * @param[in] info           Strides and paddings of the operation.
- * @param[in] invalid_right  Extra columns added to the output width.
- * @param[in] invalid_bottom Extra rows added to the output height.
- *
- * @return A pair (width, height) with the output dimensions.
- */
-const std::pair<unsigned int, unsigned int>
-arm_compute::transposeconv_output_dimensions(unsigned int in_width, unsigned int in_height,
-                                             unsigned int kernel_width, unsigned int kernel_height,
-                                             const PadStrideInfo &info, unsigned int invalid_right,
-                                             unsigned int invalid_bottom)
-{
-  const unsigned int stride_x = info.stride().first;
-  const unsigned int stride_y = info.stride().second;
-  const unsigned int padx = info.pad_left() + info.pad_right();
-  const unsigned int pady = info.pad_top() + info.pad_bottom();
-
-  ARM_COMPUTE_ERROR_ON(in_width < 1 || in_height < 1);
-  ARM_COMPUTE_ERROR_ON(kernel_width <= padx);
-  ARM_COMPUTE_ERROR_ON(kernel_height <= pady);
-
-  // Transpose conv output size along one axis:
-  //    tconv_out + pad = (in - 1) * stride + kernel + invalid
-  //    tconv_out       = (in - 1) * stride + kernel + invalid - pad
-  // The arithmetic stays unsigned throughout, matching the unsigned return type.
-  const unsigned int w = stride_x * (in_width - 1) + kernel_width - padx + invalid_right;
-  const unsigned int h = stride_y * (in_height - 1) + kernel_height - pady + invalid_bottom;
-
-  return std::make_pair(w, h);
-}
diff --git a/compute/ARMComputeEx/src/runtime/CL/CLFunctionsEx.cpp b/compute/ARMComputeEx/src/runtime/CL/CLFunctionsEx.cpp
deleted file mode 100644
index 158fe0b0c..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/CLFunctionsEx.cpp
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "arm_compute/runtime/CL/CLFunctionsEx.h"
-
-// NOTE This empty file aims to validate "CLFunctionsEx.h".
-//      DO NOT REMOVE this file.
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp
deleted file mode 100644
index 267228eac..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/Utils.h"
-
-namespace arm_compute
-{
-CLArgMinMaxLayerEx::CLArgMinMaxLayerEx(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _results_vector(), _not_reshaped_output(),
-      _reduction_kernels_vector(), _reshape_kernel(), _num_of_stages(), _reduction_axis()
-{
-}
-
-Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const ITensorInfo *output,
-                                    const ReductionOperation &op)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX &&
-                                      op != ReductionOperation::ARG_IDX_MIN,
-                                  "Invalid reduction operation");
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast<int>(TensorShape::num_max_dimensions),
-                                  "Reduction axis greater than max number of dimensions");
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
-  const unsigned int num_of_stages =
-      calculate_number_of_stages_only_x_axis(input->dimension(0), axis);
-
-  DataType output_data_type = DataType::S32;
-  TensorInfo not_reshaped_output;
-  const auto input_num_channles = input->num_channels();
-  const auto input_qinfo = input->quantization_info();
-
-  if (output->total_size() != 0)
-  {
-    output_data_type = output->data_type();
-    const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(
-        arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis,
-                                                                   false));
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output);
-  }
-
-  auto shape_before_reshape = input->tensor_shape();
-  shape_before_reshape.set(axis, 1);
-  auto initialize_tensorinfo = [](TensorInfo &ti, TensorShape shape, DataType data_type,
-                                  int num_channels, QuantizationInfo qinfo) {
-    ti.set_data_type(data_type)
-        .set_tensor_shape(shape)
-        .set_num_channels(num_channels)
-        .set_quantization_info(qinfo);
-  };
-
-  initialize_tensorinfo(not_reshaped_output, shape_before_reshape, output_data_type,
-                        input_num_channles, input_qinfo);
-
-  if (num_of_stages == 1)
-  {
-    ARM_COMPUTE_RETURN_ON_ERROR(
-        CLArgMinMaxLayerKernelEx::validate(input, nullptr, &not_reshaped_output, axis, op));
-  }
-  else
-  {
-    // Create temporary tensor infos
-    std::vector<TensorInfo> sums_vector(num_of_stages - 1);
-
-    // Create intermediate tensor info
-    TensorShape shape{input->tensor_shape()};
-
-    for (unsigned int i = 0; i < num_of_stages - 1; i++)
-    {
-      shape.set(0, ceil(shape.x() / 128.f));
-      sums_vector[i].set_data_type(input->data_type());
-      sums_vector[i].set_tensor_shape(shape);
-      sums_vector[i].set_num_channels(input->num_channels());
-    }
-
-    // Validate ReductionOperation only on first kernel
-    ARM_COMPUTE_RETURN_ON_ERROR(
-        CLArgMinMaxLayerKernelEx::validate(input, nullptr, &sums_vector[0], axis, op));
-
-    // Validate ReductionOperation on intermediate stages
-    for (unsigned int i = 1; i < num_of_stages - 1; ++i)
-    {
-      ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernelEx::validate(input, &sums_vector[i - 1],
-                                                                     &sums_vector[i], axis, op));
-    }
-
-    // Validate ReductionOperation on the last stage
-    const unsigned int last_stage = num_of_stages - 1;
-    ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernelEx::validate(
-        input, &sums_vector[last_stage - 1], &not_reshaped_output, axis, op));
-  }
-  ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(&not_reshaped_output, output));
-  return Status{};
-}
-
-void CLArgMinMaxLayerEx::configure(const ICLTensor *input, int axis, ICLTensor *output,
-                                   const ReductionOperation &op)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  _num_of_stages = calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis);
-  _reduction_axis = axis;
-
-  const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(
-      input->info()->tensor_shape(), axis, false);
-  DataType output_data_type = (output->info()->data_type() == DataType::UNKNOWN)
-                                  ? DataType::S32
-                                  : output->info()->data_type();
-  auto_init_if_empty(*output->info(), input->info()
-                                          ->clone()
-                                          ->set_tensor_shape(output_shape)
-                                          .set_data_type(output_data_type)
-                                          .reset_padding()
-                                          .set_is_resizable(true));
-
-  // Configure reduction operation kernels
-  _reduction_kernels_vector.resize(_num_of_stages);
-
-  _memory_group.manage(&_not_reshaped_output);
-  // Create temporary tensors
-  if (_num_of_stages == 1)
-  {
-    // Force an early initialization for int64 output type
-    TensorShape output_shape{input->info()->tensor_shape()};
-    output_shape.set(axis, 1);
-    auto_init_if_empty(*_not_reshaped_output.info(), input->info()
-                                                         ->clone()
-                                                         ->set_tensor_shape(output_shape)
-                                                         .set_data_type(output_data_type)
-                                                         .reset_padding()
-                                                         .set_is_resizable(true));
-    _not_reshaped_output.info()->set_tensor_shape(output_shape);
-    _reduction_kernels_vector[0].configure(input, nullptr, &_not_reshaped_output, axis, op);
-  }
-  else
-  {
-    _results_vector.resize(_num_of_stages - 1);
-    TensorShape shape{input->info()->tensor_shape()};
-    for (unsigned int i = 0; i < _num_of_stages - 1; i++)
-    {
-      shape.set(0, ceil(shape.x() / 128.f));
-      _results_vector[i].allocator()->init(
-          input->info()->clone()->set_tensor_shape(shape).set_data_type(output_data_type));
-    }
-
-    // Apply ReductionOperation only on first kernel
-    _memory_group.manage(&_results_vector[0]);
-    _reduction_kernels_vector[0].configure(input, nullptr, &_results_vector[0], axis, op);
-
-    // Apply ReductionOperation on intermediate stages
-    for (unsigned int i = 1; i < _num_of_stages - 1; ++i)
-    {
-      _memory_group.manage(&_results_vector[i]);
-      _reduction_kernels_vector[i].configure(input, &_results_vector[i - 1], &_results_vector[i],
-                                             axis, op);
-      _results_vector[i - 1].allocator()->allocate();
-    }
-
-    // Apply ReductionOperation on the last stage
-    const unsigned int last_stage = _num_of_stages - 1;
-    _reduction_kernels_vector[last_stage].configure(input, &_results_vector[last_stage - 1],
-                                                    &_not_reshaped_output, axis, op);
-    _results_vector[last_stage - 1].allocator()->allocate();
-  }
-  _reshape_kernel.configure(&_not_reshaped_output, output);
-  _not_reshaped_output.allocator()->allocate();
-}
-
-void CLArgMinMaxLayerEx::run()
-{
-  MemoryGroupResourceScope scope_mg(_memory_group);
-
-  for (unsigned int i = 0; i < _num_of_stages; ++i)
-  {
-    CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
-  }
-  CLScheduler::get().enqueue(_reshape_kernel, false);
-}
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
deleted file mode 100644
index e5122ab8f..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h"
-
-#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output,
-                                  BinaryLogicalOperation op)
-{
-  auto k = support::cpp14::make_unique<CLBinaryLogicalOpKernel>();
-  k->configure(input1, input2, output, op);
-  _kernel = std::move(k);
-
-  if (output->info()->dimension(0) > 1)
-  {
-    ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
-    if (broadcasted_info->info()->dimension(0) == 1)
-    {
-      _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
-    }
-  }
-}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp
deleted file mode 100644
index c7d0ac8e2..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLCastBool.h"
-
-#include "arm_compute/core/CL/kernels/CLCastBoolKernel.h"
-
-using namespace arm_compute;
-
-void CLCastBool::configure(ICLTensor *input, ICLTensor *output)
-{
-  auto k = arm_compute::support::cpp14::make_unique<CLCastBoolKernel>();
-  k->configure(input, output);
-  _kernel = std::move(k);
-}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp
deleted file mode 100644
index 3dede0562..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp
+++ /dev/null
@@ -1,267 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/UtilsEx.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-#include <memory>
-#include <tuple>
-
-namespace arm_compute
-{
-using namespace arm_compute::misc::shape_calculator;
-
-CLDirectTransposeConvLayer::CLDirectTransposeConvLayer(
-    std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
-    : _memory_group(std::move(memory_manager)),
-      _scale_f(),
-      _conv_f(),
-      _flip_weights(),
-      _scaled_output(),
-      _original_weights(nullptr),
-      _weights_flipped(),
-      _flip_axis(),
-      _is_prepared(false)
-{
-}
-
-Status CLDirectTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights,
-                                            const ITensorInfo *bias, ITensorInfo *output,
-                                            const PadStrideInfo &info, unsigned int invalid_right,
-                                            unsigned int invalid_bottom,
-                                            const WeightsInfo &weights_info)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
-      input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
-  const DataLayout data_layout = input->data_layout();
-
-  const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-  const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-  const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-
-  ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h));
-  ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1);
-
-  auto out_dims = transposeconv_output_dimensions(
-      input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w),
-      weights->dimension(idx_h), info, invalid_right, invalid_bottom);
-
-  const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
-
-  if (bias != nullptr)
-  {
-    if (is_data_type_quantized_asymmetric(input->data_type()))
-    {
-      ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
-    }
-    else
-    {
-      ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
-    }
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias);
-  }
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w],
-                                  "Output's width is invalid.");
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h],
-                                  "Output's height is invalid.");
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c],
-                                  "Output's depth is invalid.");
-
-  unsigned int pad_left = 0;
-  unsigned int pad_right = 0;
-  unsigned int pad_top = 0;
-  unsigned int pad_bottom = 0;
-  const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
-      *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top,
-      pad_bottom);
-  TensorInfo scale_out_info(input->clone()
-                                ->set_is_resizable(true)
-                                .reset_padding()
-                                .set_tensor_shape(scale_out_shape)
-                                .set_data_layout(data_layout));
-  const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
-
-  ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, info));
-  ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output,
-                                                           conv_info, weights_info));
-
-  return Status{};
-}
-
-void CLDirectTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights,
-                                           const ICLTensor *bias, ICLTensor *output,
-                                           const PadStrideInfo &info, unsigned int invalid_right,
-                                           unsigned int invalid_bottom,
-                                           const WeightsInfo &weights_info)
-{
-  configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, info,
-            invalid_right, invalid_bottom, weights_info);
-}
-
-void CLDirectTransposeConvLayer::configure(const CLCompileContext &compile_context,
-                                           ICLTensor *input, ICLTensor *weights,
-                                           const ICLTensor *bias, ICLTensor *output,
-                                           const PadStrideInfo &info, unsigned int invalid_right,
-                                           unsigned int invalid_bottom,
-                                           const WeightsInfo &weights_info)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-
-  unsigned int pad_left = 0;
-  unsigned int pad_right = 0;
-  unsigned int pad_top = 0;
-  unsigned int pad_bottom = 0;
-  const unsigned int stride_x = info.stride().first;
-  const unsigned int stride_y = info.stride().second;
-
-  const DataLayout data_layout = input->info()->data_layout();
-
-  const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-  const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
-  _original_weights = weights;
-  _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
-  _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
-  _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis);
-
-  auto out_dims = transposeconv_output_dimensions(
-      input->info()->dimension(idx_w), input->info()->dimension(idx_h),
-      weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right,
-      invalid_bottom);
-
-  const TensorShape output_shape =
-      compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info());
-
-  // Output auto initialization if not yet initialized
-  auto_init_if_empty(
-      *output->info(),
-      input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
-
-  // Perform validation step
-  ARM_COMPUTE_ERROR_THROW_ON(CLDirectTransposeConvLayer::validate(
-      input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(),
-      info, invalid_right, invalid_bottom));
-
-  _is_prepared = weights_info.retain_internal_weights();
-
-  _memory_group.manage(&_scaled_output);
-
-  // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order
-  // to match output shape
-  const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
-      *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left,
-      pad_right, pad_top, pad_bottom);
-
-  TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(),
-                            input->info()->quantization_info());
-  scale_out_info.set_data_layout(data_layout);
-  _scaled_output.allocator()->init(scale_out_info);
-
-  // configure scale function
-  const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom,
-                                    DimensionRoundingType::FLOOR);
-  _scale_f.configure(input, &_scaled_output, upsample_info);
-
-  // Setup the function to convolve the upscaled output
-  const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
-  _conv_f.configure(compile_context, &_scaled_output, &_weights_flipped, bias, output, conv_info,
-                    weights_info);
-  _scaled_output.allocator()->allocate();
-
-  // Setup flip axis data
-  _flip_axis.allocator()->allocate();
-  _flip_axis.map(true);
-  auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer());
-  if (weights->info()->data_layout() == DataLayout::NHWC)
-  {
-    axis_data[0] = 1;
-    axis_data[1] = 2;
-  }
-  else
-  {
-    axis_data[0] = 0;
-    axis_data[1] = 1;
-  }
-  _flip_axis.unmap();
-}
-
-void CLDirectTransposeConvLayer::run()
-{
-  prepare();
-
-  MemoryGroupResourceScope scope_mg(_memory_group);
-
-  _scale_f.run();
-  _conv_f.run();
-}
-
-void CLDirectTransposeConvLayer::prepare()
-{
-  if (!_is_prepared)
-  {
-    ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-    // Run weights flipping and mark original weights tensor as unused
-    _weights_flipped.allocator()->allocate();
-    _flip_weights.run();
-    _original_weights->mark_as_unused();
-
-    // Prepare convolution
-    _conv_f.prepare();
-
-    // Free flipped weights
-    if (!_weights_flipped.is_used())
-    {
-      _weights_flipped.allocator()->free();
-    }
-
-    _is_prepared = true;
-  }
-}
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
deleted file mode 100644
index ae9d8afc6..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLEmbeddingLookup.h"
-
-#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h"
-
-using namespace arm_compute;
-
-void CLEmbeddingLookup::configure(const ICLTensor *input, ICLTensor *output,
-                                  const ICLTensor *lookups)
-{
-  auto k = support::cpp14::make_unique<CLEmbeddingLookupKernel>();
-  k->configure(input, output, lookups);
-  _kernel = std::move(k);
-}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp
deleted file mode 100644
index 01989461e..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp
+++ /dev/null
@@ -1,339 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h"
-
-#include "arm_compute/core/Size2D.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/MemorySupport.h"
-
-#include <algorithm>
-
-using namespace arm_compute;
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output)
-{
-  ARM_COMPUTE_UNUSED(input);
-  ARM_COMPUTE_UNUSED(weights);
-  ARM_COMPUTE_UNUSED(output);
-  ARM_COMPUTE_RETURN_ON_ERROR(
-      CLGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output));
-
-  return Status{};
-}
-} // namespace
-
-void CLFullyConnectedHybridLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output)
-{
-  auto k = support::cpp14::make_unique<CLTransposeKernel>();
-  k->configure(input, output);
-  _kernel = std::move(k);
-}
-
-Status CLFullyConnectedHybridLayerReshapeWeights::validate(const ITensorInfo *input,
-                                                           const ITensorInfo *output)
-{
-  return CLTransposeKernel::validate(input, output);
-}
-
-CLFullyConnectedHybridLayer::CLFullyConnectedHybridLayer(
-    std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(memory_manager), _reshape_weights_kernel(), _quant_input_kernel(),
-      _mm_gemmlowp(memory_manager), _multiply_scale_kernel(), _accumulate_biases_kernel(),
-      _reshape_weights_output(), _quantized_input(), _scale_factor(), _gemmlowp_output(),
-      _are_weights_reshaped(true), _accumulate_biases(false), _is_prepared(false),
-      _original_weights(nullptr)
-{
-}
-void CLFullyConnectedHybridLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights,
-                                               ICLTensor *output, bool retain_internal_weights)
-{
-  ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
-
-  ARM_COMPUTE_UNUSED(output);
-  ARM_COMPUTE_UNUSED(retain_internal_weights);
-  // Configure gemmlowp function
-  _mm_gemmlowp.configure(input, weights, nullptr, output);
-}
-
-void CLFullyConnectedHybridLayer::configure(const ICLTensor *input, const ICLTensor *weights,
-                                            const ICLTensor *biases, ICLTensor *output,
-                                            FullyConnectedLayerInfo fc_info)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-
-  // Perform validate step
-  ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedHybridLayer::validate(
-      input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
-      fc_info));
-
-  _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
-  _accumulate_biases = false;
-  _is_prepared = fc_info.retain_internal_weights;
-  _original_weights = weights;
-
-  // Configure accumulate biases kernel for non quantized asymmetric types
-  if (biases != nullptr)
-  {
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-
-    _accumulate_biases = true;
-
-    // Configure accumulate biases kernel
-    _accumulate_biases_kernel.set_target(CLScheduler::get().target());
-    _accumulate_biases_kernel.configure(output, biases);
-  }
-
-  const ICLTensor *weights_to_use = weights;
-
-  // With the Fully Connected layer we can have 4 different cases:
-  //  1) Convolution layer -> Fully Connected layer without batches
-  //  2) Fully Connected layer -> Fully Connected layer without batches
-  //  3) Convolution layer -> Fully Connected layer with batches
-  //  4) Fully Connected layer -> Fully Connected layer with batches
-
-  // Check if we have a fully connected layer with batches
-  const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
-  bool is_fc_after_conv = false;
-  if (is_batched_fc_layer)
-  {
-    is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
-                       (std::equal(input->info()->tensor_shape().cbegin() + 3,
-                                   input->info()->tensor_shape().cend(),
-                                   output->info()->tensor_shape().cbegin() + 1));
-  }
-  else
-  {
-    is_fc_after_conv = input->info()->num_dimensions() > 1 && input->info()->dimension(1) > 1;
-  }
-  ARM_COMPUTE_ERROR_ON_MSG(is_fc_after_conv,
-                           "CLFullyConnectedHybridLayer does not support after conv");
-  ARM_COMPUTE_UNUSED(is_fc_after_conv);
-
-  // Reshape weights if needed
-  if (!_are_weights_reshaped)
-  {
-    // Reshape the weights
-    _reshape_weights_output.allocator()->init(
-        weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
-            compute_transposed_shape(*weights->info())));
-    _reshape_weights_kernel.configure(weights_to_use, &_reshape_weights_output);
-    weights_to_use = &_reshape_weights_output;
-  }
-
-  // Extract scale factor
-  _scale_factor.allocator()->init(
-      TensorInfo(TensorShape{output->info()->dimension(1)}, 1, input->info()->data_type()));
-  _memory_group.manage(&_scale_factor);
-  _scale_factor_kernel.configure(input, &_scale_factor);
-
-  // Quantize input
-  _quantized_input.allocator()->init(
-      input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(
-          DataType::QASYMM8_SIGNED));
-  _memory_group.manage(&_quantized_input);
-  _quant_input_kernel.configure(input, &_scale_factor, &_quantized_input);
-
-  // GEMMLowp
-  _gemmlowp_output.allocator()->init(
-      output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
-  _memory_group.manage(&_gemmlowp_output);
-  configure_mm(&_quantized_input, weights_to_use, &_gemmlowp_output,
-               fc_info.retain_internal_weights);
-  _quantized_input.allocator()->allocate();
-
-  // Multiply scale
-  _multiply_scale_kernel.configure(&_gemmlowp_output, &_scale_factor, output,
-                                   weights->info()->quantization_info().uniform().scale);
-  _gemmlowp_output.allocator()->allocate();
-  _scale_factor.allocator()->allocate();
-
-  _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights;
-}
-
-Status CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITensorInfo *weights,
-                                             const ITensorInfo *biases, const ITensorInfo *output,
-                                             FullyConnectedLayerInfo fc_info)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8_SIGNED);
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-  ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
-
-  bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
-  bool is_fc_after_conv = true;
-  const GPUTarget gpu_target = CLScheduler::get().target();
-
-  const ITensorInfo &reshaped_weights =
-      TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
-          compute_transposed_shape(*weights)));
-
-  // Configure accumulate biases kernel for non quantized asymmetric types
-  if (biases != nullptr)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-    ARM_COMPUTE_RETURN_ON_ERROR(
-        CLGEMMMatrixAccumulateBiasesKernel::validate(output, biases, gpu_target));
-  }
-
-  // With the Fully Connected layer we can have 4 different cases:
-  //  1) Convolution layer -> Fully Connected layer without batches
-  //  2) Fully Connected layer -> Fully Connected layer without batches
-  //  3) Convolution layer -> Fully Connected layer with batches
-  //  4) Fully Connected layer -> Fully Connected layer with batches
-
-  const ITensorInfo *weights_to_use = weights;
-
-  // Check if we have a fully connected layer with batches
-  const bool is_batched_fc_layer = output->dimension(1) > 1;
-  if (is_batched_fc_layer)
-  {
-    is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
-                       (std::equal(input->tensor_shape().cbegin() + 3, input->tensor_shape().cend(),
-                                   output->tensor_shape().cbegin() + 1));
-  }
-  else
-  {
-    is_fc_after_conv = input->num_dimensions() > 1 && input->dimension(1) > 1;
-  }
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_fc_after_conv,
-                                  "CLFullyConnectedHybridLayer does not support after conv");
-
-  if (!weights_reshaped)
-  {
-    // Validate reshape weights kernel
-    ARM_COMPUTE_RETURN_ON_ERROR(
-        CLFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights));
-    weights_to_use = &reshaped_weights;
-  }
-
-  // Validate Scale factor kernel
-  const ITensorInfo &scale_factor =
-      TensorInfo(TensorShape{output->dimension(1)}, 1, input->data_type());
-  ARM_COMPUTE_RETURN_ON_ERROR(CLScaleFactorSymm8Kernel::validate(input, &scale_factor));
-
-  // Validate quantization symm8 kernel
-  const ITensorInfo &quantized_input =
-      TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_data_type(
-          DataType::QASYMM8_SIGNED));
-  ARM_COMPUTE_RETURN_ON_ERROR(
-      CLQuantizationSymmetricKernel::validate(input, &scale_factor, &quantized_input));
-
-  // Fully Connected layer after a Fully Connected Layer without batches
-  ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
-
-  // Validate matrix multiply kernel
-  const ITensorInfo &gemmlowp_output = TensorInfo(
-      output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(quantized_input, *weights_to_use, gemmlowp_output));
-
-  // Multiply scale
-  ARM_COMPUTE_RETURN_ON_ERROR(
-      CLMultiplyScaleFactorKernel::validate(&gemmlowp_output, &scale_factor, output));
-
-  return Status{};
-}
-
-void CLFullyConnectedHybridLayer::run()
-{
-  prepare();
-
-  MemoryGroupResourceScope scope_mg(_memory_group);
-
-  // Extract scale_factor
-  CLScheduler::get().enqueue(_scale_factor_kernel);
-
-  // Quantize input
-  CLScheduler::get().enqueue(_quant_input_kernel);
-
-  // Run matrix multiply
-  _mm_gemmlowp.run();
-
-  // Multiply scale factor
-  CLScheduler::get().enqueue(_multiply_scale_kernel);
-
-  // Accumulate biases if provided
-  if (_accumulate_biases)
-  {
-    CLScheduler::get().enqueue(_accumulate_biases_kernel);
-  }
-}
-
-void CLFullyConnectedHybridLayer::prepare()
-{
-  if (!_is_prepared)
-  {
-    ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-    auto release_unused = [](CLTensor *w) {
-      if (!w->is_used())
-      {
-        CLScheduler::get().queue().finish();
-        w->allocator()->free();
-      }
-    };
-
-    // Reshape of the weights if needed (happens only once)
-    if (!_are_weights_reshaped)
-    {
-      // Run reshape weights kernel and mark weights as unused
-      _reshape_weights_output.allocator()->allocate();
-      _reshape_weights_kernel.run();
-
-      _are_weights_reshaped = true;
-      // We can not release _original_weights because it can be used in other nodes
-    }
-
-    // Prepare GEMM prepare and release unused weights
-    _mm_gemmlowp.prepare();
-
-    // Release reshaped weights if unused
-    release_unused(&_reshape_weights_output);
-
-    _is_prepared = true;
-  }
-}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp
deleted file mode 100644
index 2ff4b9659..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp
+++ /dev/null
@@ -1,583 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h"
-
-#include "arm_compute/core/Size2D.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/Cast.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/MemorySupport.h"
-
-#include <algorithm>
-
-namespace arm_compute
-{
-using namespace arm_compute::misc::shape_calculator;
-using namespace arm_compute::utils::cast;
-
-namespace
-{
-Status construct_gemmlowp_output_stage(const ITensorInfo &input, const ITensorInfo &weights,
-                                       const ITensorInfo &output,
-                                       GEMMLowpOutputStageInfo &gemmlowp_output_stage)
-{
-  gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
-  gemmlowp_output_stage.gemmlowp_offset = 0;
-  gemmlowp_output_stage.gemmlowp_multiplier = 0;
-  gemmlowp_output_stage.gemmlowp_shift = 0;
-
-  // Configure output stage for quantized case
-  if (is_data_type_quantized_asymmetric(input.data_type()))
-  {
-    const UniformQuantizationInfo iq_info = input.quantization_info().uniform();
-    const UniformQuantizationInfo wq_info = weights.quantization_info().uniform();
-    const UniformQuantizationInfo oq_info = output.quantization_info().uniform();
-
-    const auto output_quant_info = (output.total_size() == 0) ? iq_info : oq_info;
-
-    const float multiplier = (iq_info.scale * wq_info.scale) / output_quant_info.scale;
-    int output_multiplier = 0;
-    int output_shift = 0;
-    ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(
-        multiplier, &output_multiplier, &output_shift));
-
-    // Set the GEMMLowp output stage info
-    gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset;
-    gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier;
-    gemmlowp_output_stage.gemmlowp_shift = output_shift;
-    gemmlowp_output_stage.gemmlowp_min_bound = 0;
-    gemmlowp_output_stage.gemmlowp_max_bound = 255;
-    gemmlowp_output_stage.gemmlowp_multipliers.push_back(output_multiplier);
-    gemmlowp_output_stage.gemmlowp_shifts.push_back(output_shift);
-  }
-
-  return Status{};
-}
-
-Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo *bias,
-                   const ITensorInfo &output, const FullyConnectedLayerInfo &fc_info)
-{
-  GEMMLowpOutputStageInfo gemmlowp_output_stage;
-  ARM_COMPUTE_RETURN_ON_ERROR(
-      construct_gemmlowp_output_stage(input, weights, output, gemmlowp_output_stage));
-
-  const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped
-                                       false, // is_b_reshaped
-                                       true,  // reshape_b_only_on_first_run
-                                       0,     // depth_output_gemm3d
-                                       false, // reinterpret_input_as_3d
-                                       fc_info.retain_internal_weights, // retain_internal_weights
-                                       gemmlowp_output_stage,           // gemmlowp_output_stage
-                                       fc_info.fp_mixed_precision,      // fp_mixed_precision
-                                       true,                            // broadcast_bias
-                                       ActivationLayerInfo());          // activation_info
-
-  if (is_data_type_quantized_asymmetric(input.data_type()))
-  {
-    const UniformQuantizationInfo iq_info = input.quantization_info().uniform();
-    const UniformQuantizationInfo wq_info = weights.quantization_info().uniform();
-
-    // Since we need negative offsets for computing convolution, we need to change
-    // QuantizationInfo()
-    // Extract and negate input and weights offset
-    const QuantizationInfo input_quantization_info(iq_info.scale, -iq_info.offset);
-    const QuantizationInfo weights_quantization_info(wq_info.scale, -wq_info.offset);
-
-    // Validate gemmlowp function
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(
-        &input.clone()->set_quantization_info(input_quantization_info),
-        &weights.clone()->set_quantization_info(weights_quantization_info), bias, &output,
-        gemm_info));
-  }
-  else
-  {
-    ARM_COMPUTE_RETURN_ON_ERROR(
-        CLGEMM::validate(&input, &weights, bias, &output, 1.f, 1.f, gemm_info));
-  }
-
-  return Status{};
-}
-} // namespace
-
-void CLFullyConnectedLayerReshapeWeightsEx::configure(const ICLTensor *input, ICLTensor *output)
-{
-  auto k = support::cpp14::make_unique<CLTransposeKernel>();
-  k->configure(input, output);
-  _kernel = std::move(k);
-}
-
-Status CLFullyConnectedLayerReshapeWeightsEx::validate(const ITensorInfo *input,
-                                                       const ITensorInfo *output)
-{
-  return CLTransposeKernel::validate(input, output);
-}
-
-CLFullyConnectedLayerEx::CLFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager,
-                                                 IWeightsManager *weights_manager)
-    : _memory_group(memory_manager), _weights_manager(weights_manager), _convert_weights(),
-      _convert_weights_managed(), _reshape_weights_managed_function(), _flatten_layer(),
-      _reshape_weights_function(), _mm_gemm(memory_manager, weights_manager),
-      _mm_gemmlowp(memory_manager), _flatten_output(), _converted_weights_output(),
-      _reshape_weights_output(), _are_weights_converted(true), _are_weights_reshaped(true),
-      _is_fc_after_conv(true), _is_quantized(false), _is_prepared(false), _original_weights(nullptr)
-{
-}
-void CLFullyConnectedLayerEx::configure_mm(const ICLTensor *input, const ICLTensor *weights,
-                                           const ICLTensor *bias, ICLTensor *output,
-                                           const FullyConnectedLayerInfo &fc_info)
-{
-  GEMMLowpOutputStageInfo gemmlowp_output_stage;
-  construct_gemmlowp_output_stage(*input->info(), *weights->info(), *output->info(),
-                                  gemmlowp_output_stage);
-
-  const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped
-                                       false, // is_b_reshaped
-                                       true,  // reshape_b_only_on_first_run
-                                       0,     // depth_output_gemm3d
-                                       false, // reinterpret_input_as_3d
-                                       fc_info.retain_internal_weights, // retain_internal_weights
-                                       gemmlowp_output_stage,           // gemmlowp_output_stage
-                                       fc_info.fp_mixed_precision,      // fp_mixed_precision
-                                       true,                            // broadcast_bias
-                                       ActivationLayerInfo());          // activation_info
-
-  if (_is_quantized)
-  {
-    // Since we need negative offsets for computing convolution, we need to change
-    // QuantizationInfo()
-    // Extract and negate input and weights offset
-    const QuantizationInfo input_quantization_info = input->info()->quantization_info();
-    const QuantizationInfo weights_quantization_info = weights->info()->quantization_info();
-
-    input->info()->set_quantization_info(QuantizationInfo(
-        input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
-    weights->info()->set_quantization_info(QuantizationInfo(
-        weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
-
-    // Configure gemmlowp function
-    _mm_gemmlowp.configure(input, weights, bias, output, gemm_info);
-
-    // Revert back QuantizatioInfo as input and weights could be used in other fully connected
-    // layers
-    input->info()->set_quantization_info(input_quantization_info);
-    weights->info()->set_quantization_info(weights_quantization_info);
-  }
-  else
-  {
-    // Configure matrix multiply kernel
-    _mm_gemm.configure(input, weights, bias, output, 1.f, 1.f, gemm_info);
-  }
-}
-
-void CLFullyConnectedLayerEx::configure_conv_fc(const ICLTensor *input, const ICLTensor *weights,
-                                                const ICLTensor *bias, ICLTensor *output,
-                                                const FullyConnectedLayerInfo &fc_info)
-{
-  ARM_COMPUTE_ERROR_ON(
-      (weights->info()->dimension(1) !=
-       (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
-
-  // If the fully connected layer is called after a convolution layer, the input tensor must be
-  // linearized
-
-  // Initialize output tensor for flatten
-  TensorShape shape_flatten = compute_flatten_shape(input->info());
-  _flatten_output.allocator()->init(input->info()
-                                        ->clone()
-                                        ->set_is_resizable(true)
-                                        .reset_padding()
-                                        .set_tensor_shape(shape_flatten)
-                                        .set_data_layout(DataLayout::NCHW));
-
-  // Configure flatten kernel
-  _memory_group.manage(&_flatten_output);
-  _flatten_layer.configure(input, &_flatten_output);
-
-  // Configure matrix multiply kernel
-  configure_mm(&_flatten_output, weights, bias, output, fc_info);
-
-  // Allocate the output tensor for flatten once all the configure methods have been called
-  _flatten_output.allocator()->allocate();
-}
-
-void CLFullyConnectedLayerEx::configure_fc_fc(const ICLTensor *input, const ICLTensor *weights,
-                                              const ICLTensor *bias, ICLTensor *output,
-                                              const FullyConnectedLayerInfo &fc_info)
-{
-  ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
-
-  // Configure matrix multiply kernel
-  configure_mm(input, weights, bias, output, fc_info);
-}
-
-void CLFullyConnectedLayerEx::configure(const ICLTensor *input, const ICLTensor *weights,
-                                        const ICLTensor *biases, ICLTensor *output,
-                                        FullyConnectedLayerInfo fc_info)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-
-  // Perform validate step
-  ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayerEx::validate(
-      input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
-      fc_info));
-
-  _are_weights_converted = true;
-  _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
-  _is_fc_after_conv = true;
-  _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
-  _is_prepared = fc_info.retain_internal_weights;
-  _original_weights = weights;
-
-  if (_weights_manager)
-  {
-    _weights_manager->manage(weights);
-  }
-
-  const ICLTensor *weights_to_use = weights;
-
-  // With the Fully Connected layer we can have 4 different cases:
-  //  1) Convolution layer -> Fully Connected layer without batches
-  //  2) Fully Connected layer -> Fully Connected layer without batches
-  //  3) Convolution layer -> Fully Connected layer with batches
-  //  4) Fully Connected layer -> Fully Connected layer with batches
-
-  // Check if we have a fully connected layer with batches
-  const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
-  if (is_batched_fc_layer)
-  {
-    _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
-                        (std::equal(input->info()->tensor_shape().cbegin() + 3,
-                                    input->info()->tensor_shape().cend(),
-                                    output->info()->tensor_shape().cbegin() + 1));
-  }
-  else
-  {
-    _is_fc_after_conv = input->info()->num_dimensions() > 1;
-  }
-
-  // Reshape weights if needed
-  if (!_are_weights_reshaped)
-  {
-    if (_weights_manager && _weights_manager->are_weights_managed(weights))
-    {
-      _reshape_weights_managed_function.configure(weights);
-      weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(
-          _weights_manager->acquire(weights, &_reshape_weights_managed_function));
-    }
-    else
-    {
-      // Reshape the weights
-      _reshape_weights_function.configure(weights, &_reshape_weights_output);
-      weights_to_use = &_reshape_weights_output;
-    }
-  }
-
-  // Convert weights if needed
-  if (_is_fc_after_conv && (input->info()->data_layout() != fc_info.weights_trained_layout))
-  {
-    if (_weights_manager && _weights_manager->are_weights_managed(weights_to_use))
-    {
-      _convert_weights_managed.configure(weights_to_use, input->info()->tensor_shape(),
-                                         fc_info.weights_trained_layout);
-      weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(
-          _weights_manager->acquire(weights, &_convert_weights_managed));
-    }
-    else
-    {
-      // Convert weights
-      _convert_weights.configure(weights_to_use, &_converted_weights_output,
-                                 input->info()->tensor_shape(), fc_info.weights_trained_layout);
-
-      weights_to_use = &_converted_weights_output;
-    }
-    _are_weights_converted = false;
-  }
-
-  if (_is_fc_after_conv)
-  {
-    // Fully Connected layer after a Convolution Layer without batches
-    configure_conv_fc(input, weights_to_use, biases, output, fc_info);
-  }
-  else
-  {
-    // Fully Connected layer after a Fully Connected Layer without batches
-    configure_fc_fc(input, weights_to_use, biases, output, fc_info);
-  }
-}
-
-Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights,
-                                         const ITensorInfo *biases, const ITensorInfo *output,
-                                         FullyConnectedLayerInfo fc_info)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
-                                                       DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
-  ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
-
-  bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
-  bool is_fc_after_conv = true;
-
-  const ITensorInfo &flatten_input = TensorInfo(input->clone()
-                                                    ->set_is_resizable(true)
-                                                    .reset_padding()
-                                                    .set_tensor_shape(compute_flatten_shape(input))
-                                                    .set_data_layout(DataLayout::NCHW));
-  const ITensorInfo &reshaped_weights =
-      TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
-          compute_transposed_shape(*weights)));
-  const ITensorInfo &converted_weights =
-      weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding())
-                       : TensorInfo(*reshaped_weights.clone());
-
-  // With the Fully Connected layer we can have 4 different cases:
-  //  1) Convolution layer -> Fully Connected layer without batches
-  //  2) Fully Connected layer -> Fully Connected layer without batches
-  //  3) Convolution layer -> Fully Connected layer with batches
-  //  4) Fully Connected layer -> Fully Connected layer with batches
-
-  const ITensorInfo *input_to_use = input;
-  const ITensorInfo *weights_to_use = weights;
-
-  // Check if we have a fully connected layer with batches
-  const bool is_batched_fc_layer = output->dimension(1) > 1;
-  if (is_batched_fc_layer)
-  {
-    is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
-                       (std::equal(input->tensor_shape().cbegin() + 3, input->tensor_shape().cend(),
-                                   output->tensor_shape().cbegin() + 1));
-  }
-  else
-  {
-    is_fc_after_conv = input->num_dimensions() > 1;
-  }
-
-  if (!weights_reshaped)
-  {
-    // Validate reshape weights kernel
-    ARM_COMPUTE_RETURN_ON_ERROR(
-        CLFullyConnectedLayerReshapeWeightsEx::validate(weights, &reshaped_weights));
-    weights_to_use = &reshaped_weights;
-  }
-
-  if (is_fc_after_conv && (input->data_layout() != fc_info.weights_trained_layout))
-  {
-    // Validate convert weights kernel
-    ARM_COMPUTE_RETURN_ON_ERROR(CLConvertFullyConnectedWeights::validate(
-        weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout));
-    weights_to_use = &converted_weights;
-  }
-
-  if (is_fc_after_conv)
-  {
-    // Fully Connected layer after a Convolution Layer without batches
-    ARM_COMPUTE_RETURN_ERROR_ON(
-        (weights_to_use->dimension(1) !=
-         (input->dimension(0) * input->dimension(1) * input->dimension(2))));
-
-    // Validate flatten kernel
-    ARM_COMPUTE_RETURN_ON_ERROR(CLFlattenLayer::validate(input, &flatten_input));
-    input_to_use = &flatten_input;
-  }
-  else
-  {
-    // Fully Connected layer after a Fully Connected Layer without batches
-    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
-  }
-
-  // Validate matrix multiply kernel
-  ARM_COMPUTE_RETURN_ON_ERROR(
-      validate_mm(*input_to_use, *weights_to_use, biases, *output, fc_info));
-
-  return Status{};
-}
-
-void CLFullyConnectedLayerEx::run()
-{
-  if (!_is_prepared)
-  {
-    if (!_are_weights_reshaped)
-      _reshape_weights_output.allocator()->allocate();
-    if (!_are_weights_converted)
-      _converted_weights_output.allocator()->allocate();
-    _is_prepared = true;
-  }
-
-  {
-    if (!_weights_manager)
-    {
-      ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-    }
-
-    // Pointer to current weights
-    const ICLTensor *cur_weights = _original_weights;
-    // Reshape of the weights
-    if (!_are_weights_reshaped)
-    {
-      if (_weights_manager && _weights_manager->are_weights_managed(cur_weights))
-      {
-        _original_weights = utils::cast::polymorphic_downcast<ICLTensor *>(
-            _weights_manager->run(cur_weights, &_reshape_weights_managed_function));
-      }
-      else
-      {
-        _reshape_weights_function.run();
-        cur_weights = &_reshape_weights_output;
-      }
-    }
-
-    // Convert weights if needed
-    if (!_are_weights_converted)
-    {
-      if (_weights_manager && _weights_manager->are_weights_managed(cur_weights))
-      {
-        _weights_manager->run(cur_weights, &_convert_weights_managed);
-      }
-      else
-      {
-        _convert_weights.run();
-      }
-    }
-
-    // Prepare GEMM prepare
-    if (!_is_quantized)
-    {
-      _mm_gemm.prepare();
-    }
-  }
-
-  MemoryGroupResourceScope scope_mg(_memory_group);
-
-  // Linearize input if it comes from a convolutional layer
-  if (_is_fc_after_conv)
-  {
-    _flatten_layer.run();
-  }
-
-  // Run matrix multiply
-  if (_is_quantized)
-  {
-    _mm_gemmlowp.run();
-  }
-  else
-  {
-    _mm_gemm.run();
-  }
-}
-
-void CLFullyConnectedLayerEx::prepare()
-{
-#if 0 // TODO Remove this block
-    if(!_is_prepared)
-    {
-        if(!_weights_manager)
-        {
-            ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-        }
-
-        auto release_unused = [](CLTensor * w)
-        {
-            if(!w->is_used())
-            {
-                CLScheduler::get().queue().finish();
-                w->allocator()->free();
-            }
-        };
-
-        // Pointer to current weights
-        const ICLTensor *cur_weights = _original_weights;
-
-        // Reshape of the weights if needed (happens only once)
-        if(!_are_weights_reshaped)
-        {
-            if(_weights_manager && _weights_manager->are_weights_managed(_original_weights))
-            {
-                cur_weights = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->run(cur_weights, &_reshape_weights_managed_function));
-            }
-            else
-            {
-                // Run reshape weights kernel and mark weights as unused
-                _reshape_weights_output.allocator()->allocate();
-                _reshape_weights_function.run();
-
-                cur_weights->mark_as_unused();
-                cur_weights = &_reshape_weights_output;
-            }
-            _are_weights_reshaped = true;
-        }
-
-        // Convert weights if needed (happens only once)
-        if(!_are_weights_converted)
-        {
-            if(_weights_manager && _weights_manager->are_weights_managed(cur_weights))
-            {
-                _weights_manager->run(cur_weights, &_convert_weights_managed);
-            }
-            else
-            {
-                _converted_weights_output.allocator()->allocate();
-                _convert_weights.run();
-                cur_weights->mark_as_unused();
-            }
-
-            _are_weights_converted = true;
-        }
-
-        // Release reshaped weights if unused
-        release_unused(&_reshape_weights_output);
-
-        // Prepare GEMM prepare and release unused weights
-        if(!_is_quantized)
-        {
-            _mm_gemm.prepare();
-        }
-
-        // Release converted weights if unused
-        release_unused(&_reshape_weights_output);
-        release_unused(&_converted_weights_output);
-
-        _is_prepared = true;
-    }
-#endif
-}
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp
deleted file mode 100644
index 157b4d977..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h"
-
-#include <arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h>
-#include <arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h>
-#include <arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h>
-
-using namespace arm_compute;
-
-void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *input,
-                                               const arm_compute::ICLTensor *weights,
-                                               const arm_compute::ICLTensor *biases,
-                                               arm_compute::ICLTensor *output, bool needs_reshape,
-                                               const arm_compute::TensorShape &reshape,
-                                               KernelType kernel_type)
-{
-  _input = input;
-  _weights = weights;
-  _biases = biases;
-  _output = output;
-  _needs_reshape = needs_reshape;
-
-  const ICLTensor *input_to_use = input;
-  if (_needs_reshape)
-  {
-    // reshape
-    auto_init_if_empty(*_cl_buffer.info(),
-                       _input->info()->clone()->set_tensor_shape(reshape).set_data_layout(
-                           _input->info()->data_layout()));
-    _cl_reshape.configure(_input, &_cl_buffer);
-    input_to_use = &_cl_buffer;
-  }
-
-  _cl_fc = [&]() {
-    if (kernel_type == KernelType::GENERAL)
-    {
-      auto fc = new arm_compute::CLFullyConnectedLayerEx{_memory_manager};
-      fc->configure(input_to_use, _weights, _biases, _output);
-      return std::unique_ptr<arm_compute::IFunction>(fc);
-    }
-    else if (kernel_type == KernelType::PREPROCESSED_WEIGHTS)
-    {
-      bool is_hybrid = (input->info()->data_type() == DataType::F32 ||
-                        input->info()->data_type() == DataType::F16) &&
-                       (weights->info()->data_type() == DataType::S8 ||
-                        weights->info()->data_type() == DataType::QASYMM8_SIGNED);
-
-      if (is_hybrid)
-      {
-        auto fc = new arm_compute::CLFullyConnectedHybridLayer{_memory_manager};
-        ITensorInfo *weights_info = const_cast<ITensorInfo *>(_weights->info());
-        const auto orgin_weights_data_type = weights_info->data_type();
-        weights_info->set_data_type(DataType::QASYMM8_SIGNED);
-        fc->configure(input_to_use, _weights, _biases, _output);
-        weights_info->set_data_type(orgin_weights_data_type);
-        return std::unique_ptr<arm_compute::IFunction>(fc);
-      }
-      else
-      {
-        auto fc = new arm_compute::CLFullyConnectedLayer{_memory_manager};
-        fc->configure(input_to_use, _weights, _biases, _output);
-        return std::unique_ptr<arm_compute::IFunction>(fc);
-      }
-    }
-    else
-    {
-      throw std::runtime_error("CLFullyConnectedReshapingLayer: Unsupported kernel type");
-    }
-
-  }();
-
-  if (_needs_reshape)
-  {
-    // NOTE _cl_buffer is inaccessible from outside, and thus it is safe to invoke allocate here.
-    _cl_buffer.allocator()->allocate();
-  }
-}
-
-void CLFullyConnectedReshapingLayer::run(void)
-{
-  if (_needs_reshape)
-    _cl_reshape.run();
-
-  _cl_fc->run();
-}
-
-void CLFullyConnectedReshapingLayer::prepare(void) { _cl_fc->prepare(); }
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp
deleted file mode 100644
index e0b833b04..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLGatherEx.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLGatherExKernel.h"
-
-using namespace arm_compute;
-
-void CLGatherEx::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output,
-                           int axis)
-{
-  auto k = support::cpp14::make_unique<CLGatherExKernel>();
-  k->configure(input, indices, output, axis);
-  _kernel = std::move(k);
-}
-
-Status CLGatherEx::validate(const ITensorInfo *input, const ITensorInfo *indices,
-                            const ITensorInfo *output, int axis)
-{
-  return CLGatherExKernel::validate(input, indices, output, axis);
-}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
deleted file mode 100644
index 65b89a389..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLHashtableLookup.h"
-
-#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h"
-
-using namespace arm_compute;
-
-void CLHashtableLookup::configure(const ICLTensor *lookups, const ICLTensor *keys,
-                                  const ICLTensor *input, ICLTensor *output, ICLTensor *hits)
-{
-  auto k = support::cpp14::make_unique<CLHashtableLookupKernel>();
-  k->configure(lookups, keys, input, output, hits);
-  _kernel = std::move(k);
-}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
deleted file mode 100644
index 5a7e40839..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h"
-
-#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-CLInstanceNormalizationLayerEx::CLInstanceNormalizationLayerEx() {}
-
-void CLInstanceNormalizationLayerEx::configure(ICLTensor *input, ICLTensor *output,
-                                               ICLTensor *gamma, ICLTensor *beta, float epsilon)
-{
-  auto k = support::cpp14::make_unique<CLInstanceNormalizationLayerKernelEx>();
-  k->configure(input, output, gamma, beta, epsilon);
-  _kernel = std::move(k);
-}
-
-Status CLInstanceNormalizationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                                const ITensorInfo *gamma, const ITensorInfo *beta,
-                                                float epsilon)
-{
-  return CLInstanceNormalizationLayerKernelEx::validate(input, output, gamma, beta, epsilon);
-}
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp
deleted file mode 100644
index 28e5bc0da..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLNeg.h"
-
-#include "arm_compute/core/CL/kernels/CLNegKernel.h"
-
-using namespace arm_compute;
-
-void CLNeg::configure(ICLTensor *input, ICLTensor *output)
-{
-  auto k = arm_compute::support::cpp14::make_unique<CLNegKernel>();
-  k->configure(input, output);
-  _kernel = std::move(k);
-}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp
deleted file mode 100644
index aa9f32ec6..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLOneHot.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLOneHotKernel.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/MemorySupport.h"
-namespace arm_compute
-{
-CLOneHot::CLOneHot() : _memset_kernel(), _onehot_kernel(), _has_to_memset(false) {}
-void CLOneHot::configure(const ICLTensor *indices, const ICLTensor *on_value,
-                         const ICLTensor *off_value, ICLTensor *output, int depth, int axis)
-{
-  _onehot_kernel.configure(indices, on_value, off_value, output, depth, axis);
-}
-void CLOneHot::configure(const ICLTensor *indices, const ICLTensor *on_value, ICLTensor *output,
-                         PixelValue off_value, int depth, int axis)
-{
-  _has_to_memset = true;
-  _memset_kernel.configure(output, off_value);
-  _onehot_kernel.configure(indices, on_value, output, depth, axis);
-}
-Status CLOneHot::validate(const ITensorInfo *indices, const ITensorInfo *on_value,
-                          const ITensorInfo *off_value, const ITensorInfo *output, int depth,
-                          int axis)
-{
-  return CLOneHotKernel::validate(indices, on_value, off_value, output, depth, axis);
-}
-void CLOneHot::run()
-{
-  if (_has_to_memset)
-  {
-    CLScheduler::get().enqueue(_memset_kernel, true);
-  }
-
-  CLScheduler::get().enqueue(_onehot_kernel, false);
-}
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
deleted file mode 100644
index b198e7330..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLReduceOperation.h"
-
-#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-using namespace arm_compute;
-
-CLReduceOperation::CLReduceOperation(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _input(nullptr), _output(nullptr), _axis(),
-      _keep_dims(false), _interm_tensors(), _reduce_kernels(), _reshape()
-{
-}
-
-Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                   const std::set<uint32_t> &axis, bool keep_dims,
-                                   const ReduceOperation &op)
-{
-  const size_t num_of_kernels = axis.size();
-  const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0);
-
-  ARM_COMPUTE_RETURN_ERROR_ON(num_of_kernels < 1);
-
-  // Create temporary tensor infos
-  auto interm_tensors = support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
-
-  // Create intermediate tensor info
-  TensorShape shape{input->tensor_shape()};
-
-  auto it = axis.begin();
-  for (size_t i = 0; i < num_of_interm_tensors; ++i, ++it)
-  {
-    shape.set(*it, 1, false);
-    interm_tensors[i].set_data_type(input->data_type());
-    interm_tensors[i].set_tensor_shape(shape);
-    interm_tensors[i].set_num_channels(input->num_channels());
-    interm_tensors[i].set_data_layout(input->data_layout());
-    interm_tensors[i].set_quantization_info(input->quantization_info());
-  }
-
-  // Set a vector that is ordered ITensorInfo sequentially.
-  std::vector<const ITensorInfo *> tensors;
-  tensors.emplace_back(input);
-  for (size_t i = 0; i < num_of_interm_tensors; ++i)
-  {
-    tensors.emplace_back(interm_tensors.get() + i);
-  }
-  tensors.emplace_back(output);
-
-  // Validate ReduceOperation only on all kernels
-  it = axis.begin();
-  for (size_t i = 0; i < num_of_kernels; ++i, ++it)
-  {
-    ARM_COMPUTE_RETURN_ON_ERROR(
-        CLReduceOperationKernel::validate(tensors[i], tensors[i + 1], *it, op));
-  }
-
-  if (!keep_dims)
-  {
-    ARM_COMPUTE_RETURN_ON_ERROR(
-        CLReshapeLayer::validate(&interm_tensors[num_of_interm_tensors - 1], output));
-  }
-
-  return Status{};
-}
-
-void CLReduceOperation::configure(ICLTensor *input, ICLTensor *output,
-                                  const std::set<uint32_t> &axis, bool keep_dims,
-                                  ReduceOperation op)
-{
-  ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), axis, keep_dims, op));
-
-  _axis = axis;
-
-  _input = input;
-  _output = output;
-  _keep_dims = keep_dims;
-
-  // NOTE The axis must have no duplication.
-  const size_t num_of_kernels = axis.size();
-  const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0);
-
-  if (num_of_kernels < 1)
-  {
-    throw std::runtime_error("CLReduceOperation: there is no axis to reduce");
-  }
-
-  _interm_tensors = support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
-  _reduce_kernels = support::cpp14::make_unique<CLReduceOperationKernel[]>(num_of_kernels);
-
-  // Set a vector that is ordered ICLTensors sequentially.
-  std::vector<ICLTensor *> tensors;
-  tensors.emplace_back(input);
-  for (size_t i = 0; i < num_of_interm_tensors; ++i)
-  {
-    tensors.emplace_back(_interm_tensors.get() + i);
-  }
-  tensors.emplace_back(output);
-
-  // Apply ReduceOperation on all kernels
-  TensorShape shape{input->info()->tensor_shape()};
-  auto it = axis.begin();
-  for (size_t i = 0; i < num_of_kernels; ++i, ++it)
-  {
-    shape.set(*it, 1, false);
-    if (!keep_dims || i != (num_of_kernels - 1))
-    {
-      _interm_tensors[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape));
-      _memory_group.manage(&_interm_tensors[i]);
-    }
-    _reduce_kernels[i].configure(tensors[i], tensors[i + 1], *it, op);
-    if (i != 0)
-    {
-      _interm_tensors[i - 1].allocator()->allocate();
-    }
-  }
-
-  // Configure reshape layer if we want to drop the dimensions
-  if (!keep_dims)
-  {
-    _reshape.configure(&_interm_tensors[num_of_interm_tensors - 1], output);
-    _interm_tensors[num_of_interm_tensors - 1].allocator()->allocate();
-  }
-}
-
-void CLReduceOperation::run()
-{
-  MemoryGroupResourceScope scope_mg(_memory_group);
-
-  const size_t num_of_kernels = _axis.size();
-  for (size_t i = 0; i < num_of_kernels; ++i)
-  {
-    CLScheduler::get().enqueue(_reduce_kernels[i]);
-  }
-
-  if (!_keep_dims)
-  {
-    _reshape.run();
-  }
-}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp
deleted file mode 100644
index a502f032e..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp
+++ /dev/null
@@ -1,196 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLSplitVEx.h"
-#include "support/ToolchainSupport.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include <cassert>
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ICLTensor *size_splits, const std::vector<ICLTensor *> &outputs,
-                          unsigned int num_splits)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(size_splits->info()->num_dimensions() != 1,
-                                  "size_splits must be a 1-D tensor.");
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_splits != outputs.size(),
-                                  "Number of output tensors does not match number of splits.");
-  return Status{};
-}
-
-Status validate_slices(const ITensorInfo *input, const std::vector<ITensorInfo *> &outputs,
-                       uint32_t split_dim)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
-  ARM_COMPUTE_RETURN_ERROR_ON(split_dim >= input->num_dimensions());
-  ARM_COMPUTE_RETURN_ERROR_ON(outputs.size() < 2);
-
-  // Start/End coordinates
-  Coordinates start_coords;
-  Coordinates end_coords;
-  for (unsigned int d = 0; d < input->num_dimensions(); ++d)
-  {
-    end_coords.set(d, -1);
-  }
-  unsigned int axis_offset = 0;
-  // Validate output tensors
-  for (const auto &output : outputs)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
-    // Get output shape
-    const TensorShape output_shape = output->tensor_shape();
-    ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() == 0);
-
-    const size_t axis_split_step = output_shape[split_dim];
-
-    // Output auto inizialitation if not yet initialized
-    TensorInfo tmp_output_info = *output->clone();
-    auto_init_if_empty(tmp_output_info,
-                       input->clone()->set_is_resizable(true).set_tensor_shape(output_shape));
-
-    // Update coordinate on axis
-    start_coords.set(split_dim, axis_offset);
-    end_coords.set(split_dim, axis_offset + axis_split_step);
-
-    ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(input, output, start_coords, end_coords));
-
-    axis_offset += axis_split_step;
-  }
-
-  return Status{};
-}
-
-void configure_slices(const ICLTensor *input, const std::vector<ICLTensor *> &outputs,
-                      std::vector<CLSlice> &_slice_functions, uint32_t split_dim)
-{
-  unsigned int axis_offset = 0;
-  // Start/End coordinates
-  Coordinates start_coords;
-  Coordinates end_coords;
-  for (unsigned int d = 0; d < input->info()->num_dimensions(); ++d)
-  {
-    end_coords.set(d, -1);
-  }
-  int out_iter = 0;
-  for (const auto &output : outputs)
-  {
-    const TensorShape output_shape = output->info()->tensor_shape();
-    auto op_size = output_shape.total_size();
-    if (!op_size)
-    {
-      continue;
-    }
-
-    assert(op_size != 0);
-    assert(split_dim <= output_shape.num_dimensions());
-
-    const size_t axis_split_step = output_shape[split_dim];
-
-    // Output auto inizialitation if not yet initialized
-    TensorInfo tmp_output_info = *output->info()->clone();
-    auto_init_if_empty(
-        tmp_output_info,
-        input->info()->clone()->set_is_resizable(true).set_tensor_shape(output_shape));
-
-    // Update coordinate on axis
-    start_coords.set(split_dim, axis_offset);
-    end_coords.set(split_dim, axis_offset + axis_split_step);
-
-    // Configure slice function
-    _slice_functions[out_iter].configure(input, output, start_coords, end_coords);
-
-    // Set valid region from shape
-    outputs[out_iter++]->info()->set_valid_region(ValidRegion(Coordinates(), output_shape));
-    axis_offset += axis_split_step;
-  }
-}
-
-} // namespace
-
-CLSplitVEx::CLSplitVEx()
-    : _input(nullptr), _size_splits(nullptr), _outputs(), _num_splits(0), _slice_functions()
-{
-}
-
-void CLSplitVEx::configure(const ICLTensor *input, const ICLTensor *size_splits, uint32_t split_dim,
-                           const std::vector<ICLTensor *> &outputs, unsigned int num_splits)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, size_splits);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(size_splits, outputs, num_splits));
-
-  _input = input;
-  _size_splits = size_splits;
-  _outputs = outputs;
-  _num_splits = num_splits;
-
-  // Create tensor slices
-  _slice_functions.resize(_num_splits);
-
-  // Extract output tensor info
-  std::vector<ITensorInfo *> outputs_info;
-  for (auto &output : _outputs)
-  {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-    outputs_info.emplace_back(output->info());
-  }
-
-  // Validate slices
-  ARM_COMPUTE_ERROR_THROW_ON(validate_slices(_input->info(), outputs_info, split_dim));
-
-  // Configure slices
-  configure_slices(_input, _outputs, _slice_functions, split_dim);
-}
-
-void CLSplitVEx::run()
-{
-  // execute the slices
-  for (unsigned i = 0; i < _outputs.size(); ++i)
-  {
-    _slice_functions[i].run();
-  }
-}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp
deleted file mode 100644
index 3ac95a8e6..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp
+++ /dev/null
@@ -1,335 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLTopKV2.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-
-#include "../../topk_v2.h"
-
-namespace arm_compute
-{
-
-CLTopKV2::CLTopKV2()
-    : _k(0), _total_bits(0), _bits(0), _radix(0), _hist_buf_size(0), _glob_sum_buf_size(0), _n(0),
-      _input(nullptr), _values(nullptr), _indices(nullptr), _qs_idx_buf(), _qs_temp_buf(),
-      _hist_buf(), _glob_sum_buf(), _temp_buf(), _first_negative_idx_buf(), _in_key_buf(),
-      _out_key_buf(), _in_ind_buf(), _out_ind_buf(), _p_in_key_buf(nullptr),
-      _p_out_key_buf(nullptr), _p_in_ind_buf(nullptr), _p_out_ind_buf(nullptr) /*, _qs_kernel(),
-       _init_kernel(), _hist_kernel(), _scan_hist_kernel(), _glob_scan_hist_kernel(),
-       _paste_hist_kernel(), _reorder_kernel(), _find_first_negative_kernel(),
-       _reorder_negatives_kernel(), _store_kernel()*/
-{
-}
-
-void CLTopKV2::configure(ICLTensor *input, int k, ICLTensor *values, ICLTensor *indices,
-                         int total_bits, int bits)
-{
-  _total_bits = total_bits;
-  _bits = bits;
-  _n = input->info()->tensor_shape()[0];
-
-  // _total_bits should be divided by _bits.
-  ARM_COMPUTE_ERROR_ON((_total_bits % _bits) != 0);
-
-  _k = k;
-  _radix = 1 << bits;
-
-  _input = input;
-  _values = values;
-  _indices = indices;
-
-  std::string topk_env;
-
-// Disable GPU implementation
-// TODO Enable GPU implementation with verification, or remove code
-//      Invalid result on GPU
-#if 0
-  char *env = getenv("ACL_TOPKV2");
-  if (env)
-    topk_env = env;
-
-  if (topk_env == "GPU_SINGLE")
-  {
-    _qs_idx_buf = cl::Buffer(CLScheduler::get().context(),
-                             CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n);
-    _qs_temp_buf = cl::Buffer(CLScheduler::get().context(),
-                              CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n);
-
-    _qs_kernel.configure(input, values, indices, &_qs_idx_buf, &_qs_temp_buf, k, _n);
-  }
-  else if (topk_env == "GPU")
-  {
-    // n should be divided by (_GROUPS * _ITEMS)
-    ARM_COMPUTE_ERROR_ON((_n % (_GROUPS * _ITEMS)) != 0);
-
-    _hist_buf_size = _radix * _GROUPS * _ITEMS;
-    _glob_sum_buf_size = _HISTOSPLIT;
-
-    _hist_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
-                           sizeof(cl_int) * _hist_buf_size);
-    _glob_sum_buf =
-        cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
-                   sizeof(cl_int) * _glob_sum_buf_size);
-    _temp_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
-                           sizeof(cl_int) * _glob_sum_buf_size);
-    _first_negative_idx_buf = cl::Buffer(CLScheduler::get().context(),
-                                         CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int));
-    _in_key_buf = cl::Buffer(CLScheduler::get().context(),
-                             CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_float) * _n);
-    _out_key_buf = cl::Buffer(CLScheduler::get().context(),
-                              CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_float) * _n);
-    _in_ind_buf = cl::Buffer(CLScheduler::get().context(),
-                             CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n);
-    _out_ind_buf = cl::Buffer(CLScheduler::get().context(),
-                              CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n);
-
-    _p_in_key_buf = &_in_key_buf;
-    _p_out_key_buf = &_out_key_buf;
-    _p_in_ind_buf = &_in_ind_buf;
-    _p_out_ind_buf = &_out_ind_buf;
-
-    _init_kernel.configure(input, _p_in_key_buf, _p_in_ind_buf, _n);
-    _hist_kernel.configure(&_hist_buf, bits, _n);
-    _scan_hist_kernel.configure(&_hist_buf, &_glob_sum_buf, bits);
-    _glob_scan_hist_kernel.configure(&_glob_sum_buf, &_temp_buf, bits);
-    _paste_hist_kernel.configure(&_hist_buf, &_glob_sum_buf, bits);
-    _reorder_kernel.configure(&_hist_buf, bits, _n);
-    _find_first_negative_kernel.configure(&_first_negative_idx_buf, _n);
-    _reorder_negatives_kernel.configure(&_first_negative_idx_buf, _n);
-    _store_kernel.configure(values, indices, k, _n);
-  }
-  else
-#endif // Disable GPU implementation
-  {
-    // DO NOTHING for CPU.
-  }
-}
-
-void CLTopKV2::run()
-{
-  std::string topk_env;
-#if 0
-  char *env = getenv("ACL_TOPKV2");
-  if (env)
-    topk_env = env;
-
-  if (topk_env == "GPU_SINGLE")
-  {
-    run_on_gpu_single_quicksort();
-  }
-  else if (topk_env == "GPU")
-  {
-    run_on_gpu();
-  }
-  else
-#endif
-  {
-    run_on_cpu();
-  }
-}
-
-#if 0
-void CLTopKV2::run_on_gpu_single_quicksort()
-{
-  // This is a single threaded quick sort implementation.
-  CLScheduler::get().enqueue(_qs_kernel, false);
-
-  arm_compute::CLScheduler::get().sync();
-}
-
-void CLTopKV2::run_on_gpu()
-{
-  cl::CommandQueue q = CLScheduler::get().queue();
-
-  // 1. CLTopKV2Init set key buffer and index buffer.
-  //  - Key buffer is set as the same value of the layer's input
-  //  - Values in the index buffer are set as their indices.
-  CLScheduler::get().enqueue(_init_kernel, false);
-
-  int n_passes = _total_bits / _bits;
-
-  // 2. Repeat (total_bits/bits) times.
-  //   - total_bits is the number of bits of the data type (e.g., 32 for float)
-  //   - bits defines number of buckets (e.g. 16 buckets where bit is 4)
-  for (int pass = 0; pass < n_passes; ++pass)
-  {
-    arm_compute::CLScheduler::get().sync();
-
-    // 2.1. Calculate histogram with _GROUPS * _ITEMS threads
-    _hist_kernel.setPass(pass, _p_in_key_buf);
-    CLScheduler::get().enqueue(_hist_kernel, false);
-
-    // 2.2. Calculate prefix sum locally with multiple threads
-    CLScheduler::get().enqueue(_scan_hist_kernel, false);
-    // 2.3. Calculate prefix sum within a work group
-    CLScheduler::get().enqueue(_glob_scan_hist_kernel, false);
-    // 2.4. Calculate global prefix sum
-    CLScheduler::get().enqueue(_paste_hist_kernel, false);
-
-    // 2.5. Reorder keys and indices based on the global prefix sum
-    _reorder_kernel.setPass(pass, _p_in_key_buf, _p_out_key_buf, _p_in_ind_buf, _p_out_ind_buf);
-    CLScheduler::get().enqueue(_reorder_kernel, false);
-
-    cl::Buffer *tmp;
-    // swap key buffers
-    tmp = _p_in_key_buf;
-    _p_in_key_buf = _p_out_key_buf;
-    _p_out_key_buf = tmp;
-
-    // swap index buffers
-    tmp = _p_in_ind_buf;
-    _p_in_ind_buf = _p_out_ind_buf;
-    _p_out_ind_buf = tmp;
-  }
-
-  // 3. Get the first negative index
-  // Because we swap in_buf and out_buf at the end of the above for loop,
-  // the output buffers are in bufs.
-  _find_first_negative_kernel.setOutputBuffer(_p_in_key_buf);
-  CLScheduler::get().enqueue(_find_first_negative_kernel, false);
-
-  // 4. Correct odering of negatives
-  //   - Since radix sort does not consider negatives, negatives are considered as bigger values
-  //   than positives.
-  // reordered data will be stored in _p_out_key_buf and _p_out_ind_buf
-  _reorder_negatives_kernel.setBuffers(_p_in_key_buf, _p_out_key_buf, _p_in_ind_buf,
-                                       _p_out_ind_buf);
-  CLScheduler::get().enqueue(_reorder_negatives_kernel, false);
-
-  // 5. Extract top k values from sorted keys and indices.
-  _store_kernel.setOutputBuffers(_p_out_key_buf, _p_out_ind_buf);
-  CLScheduler::get().enqueue(_store_kernel, false);
-
-  arm_compute::CLScheduler::get().sync();
-
-#if 0
-  // below code is left for debugging.
-  int first_neg;
-  q.enqueueReadBuffer(_first_negative_idx_buf, CL_TRUE, 0, sizeof(cl_int), &first_neg);
-  std::cout << "first neg = " << first_neg << std::endl;
-
-  float in_key[_n];
-  q.enqueueReadBuffer(*_p_in_key_buf, CL_TRUE, 0, sizeof(cl_float)*_n, in_key);
-  for(uint32_t i = 0 ; i < _n; ++i) {
-    std::cout << "in_key[" << i << "] = " << in_key[i] << std::endl;
-  }
-
-  float out_key[_n];
-  q.enqueueReadBuffer(*_p_out_key_buf, CL_TRUE, 0, sizeof(cl_float)*_n, out_key);
-  for(uint32_t i = 0 ; i < _n; ++i) {
-    std::cout << "out_key[" << i << "] = " << out_key[i] << std::endl;
-  }
-
-  int in_ind[_n];
-  q.enqueueReadBuffer(*_p_in_ind_buf, CL_TRUE, 0, sizeof(cl_int)*_n, in_ind);
-  for(uint32_t i = 0 ; i < _n; ++i) {
-    std::cout << "in_ind[" << i << "] = " << in_ind[i] << std::endl;
-  }
-
-  int out_ind[_n];
-  q.enqueueReadBuffer(*_p_out_ind_buf, CL_TRUE, 0, sizeof(cl_int)*_n, out_ind);
-  for(uint32_t i = 0 ; i < _n; ++i) {
-    std::cout << "out_ind[" << i << "] = " << out_ind[i] << std::endl;
-  }
-
-  int hist_buf[_hist_buf_size];
-  q.enqueueReadBuffer(_hist_buf, CL_TRUE, 0, sizeof(cl_int)*_hist_buf_size, hist_buf);
-  for(uint32_t i = 0 ; i < _hist_buf_size; ++i) {
-    std::cout << "hist_buf[" << i << "] = " << hist_buf[i] << std::endl;
-  }
-
-  int glob_sum_buf[_glob_sum_buf_size];
-  q.enqueueReadBuffer(_glob_sum_buf, CL_TRUE, 0, sizeof(cl_int)*_glob_sum_buf_size, glob_sum_buf);
-  for(uint32_t i = 0 ; i < _glob_sum_buf_size; ++i) {
-    std::cout << "glob_sum_buf[" << i << "] = " << glob_sum_buf[i] << std::endl;
-  }
-
-#endif
-}
-#endif // Disable GPU implementation
-
-void CLTopKV2::run_on_cpu()
-{
-  cl::CommandQueue q = CLScheduler::get().queue();
-  // const Window& w = _topkv2_kernel.window();
-
-  _input->map(q);
-  _values->map(q);
-  _indices->map(q);
-
-  // int row_size = (w[0].end() - w[0].start()) / w[0].step();
-  int row_size = _input->info()->tensor_shape()[0];
-  int rank = _input->info()->num_dimensions();
-
-  if (rank > 2)
-    throw std::runtime_error("Not supported type.");
-
-  int row_num = (rank == 2 ? _input->info()->tensor_shape()[1] : 1);
-
-  if (_input->info()->data_type() == DataType::F32)
-  {
-    nnfw::rt::optimized_ops::TopK<float>(row_size, row_num, (float *)_input->buffer(), _k,
-                                         (int32 *)_indices->buffer(), (float *)_values->buffer());
-  }
-  else if (_input->info()->data_type() == DataType::S32)
-  {
-    nnfw::rt::optimized_ops::TopK<int32_t>(row_size, row_num, (int32_t *)_input->buffer(), _k,
-                                           (int32 *)_indices->buffer(),
-                                           (int32_t *)_values->buffer());
-  }
-  else if (_input->info()->data_type() == DataType::QASYMM8)
-  {
-    nnfw::rt::optimized_ops::TopK<uint8_t>(row_size, row_num, (uint8_t *)_input->buffer(), _k,
-                                           (int32 *)_indices->buffer(),
-                                           (uint8_t *)_values->buffer());
-  }
-  else
-  {
-    throw std::runtime_error("Not supported type.");
-  }
-
-  _input->unmap(q);
-  _values->unmap(q);
-  _indices->unmap(q);
-}
-
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp
deleted file mode 100644
index 3215d01a7..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLTransposeConvLayer.h"
-
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-#include <cmath>
-#include <memory>
-#include <tuple>
-
-using namespace arm_compute;
-using namespace arm_compute::misc::shape_calculator;
-
-CLTransposeConvLayer::CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_manager(std::move(memory_manager)), _function()
-{
-}
-
-void CLTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias,
-                                     ICLTensor *output, const PadStrideInfo &deconv_info,
-                                     unsigned int invalid_right, unsigned int invalid_bottom,
-                                     const WeightsInfo &weights_info)
-{
-  configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info,
-            invalid_right, invalid_bottom, weights_info);
-}
-
-void CLTransposeConvLayer::configure(const CLCompileContext &compile_context, ICLTensor *input,
-                                     ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
-                                     const PadStrideInfo &deconv_info, unsigned int invalid_right,
-                                     unsigned int invalid_bottom, const WeightsInfo &weights_info)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-
-  switch (CLTransposeConvLayer::get_deconvolution_method(input->info(), weights->info(), nullptr,
-                                                         output->info(), deconv_info, invalid_right,
-                                                         invalid_bottom, weights_info))
-  {
-    case DeconvolutionMethod::DIRECT:
-    {
-      auto f = arm_compute::support::cpp14::make_unique<CLDirectTransposeConvLayer>();
-      f->configure(compile_context, input, weights, bias, output, deconv_info, invalid_right,
-                   invalid_bottom, weights_info);
-      _function = std::move(f);
-      break;
-    }
-    case DeconvolutionMethod::GEMM:
-    {
-      auto f = arm_compute::support::cpp14::make_unique<CLGEMMDeconvolutionLayer>(_memory_manager);
-      f->configure(compile_context, input, weights, bias, output, deconv_info);
-      _function = std::move(f);
-      break;
-    }
-    default:
-      ARM_COMPUTE_ERROR("Not supported.");
-      break;
-  }
-}
-
-Status CLTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights,
-                                      const ITensorInfo *bias, ITensorInfo *output,
-                                      const PadStrideInfo &deconv_info, unsigned int invalid_right,
-                                      unsigned int invalid_bottom, const WeightsInfo &weights_info)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-  switch (CLTransposeConvLayer::get_deconvolution_method(
-      input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info))
-  {
-    case DeconvolutionMethod::DIRECT:
-    {
-      // Validate direct convolution layer
-      ARM_COMPUTE_RETURN_ON_ERROR(CLDirectTransposeConvLayer::validate(
-          input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info));
-      break;
-    }
-    case DeconvolutionMethod::GEMM:
-    {
-      // Validate gemm-based convolution layer
-      ARM_COMPUTE_RETURN_ON_ERROR(
-          CLGEMMDeconvolutionLayer::validate(input, weights, bias, output, deconv_info));
-      break;
-    }
-    default:
-      ARM_COMPUTE_ERROR("Not supported.");
-      break;
-  }
-
-  return Status{};
-}
-
-DeconvolutionMethod CLTransposeConvLayer::get_deconvolution_method(
-    const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias,
-    ITensorInfo *output, const PadStrideInfo &deconv_info, unsigned int invalid_right,
-    unsigned int invalid_bottom, const WeightsInfo &weights_info)
-{
-  ARM_COMPUTE_UNUSED(output, bias, weights_info);
-
-  const DataLayout data_layout = input->data_layout();
-
-  const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-  const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
-  if (weights->dimension(idx_w) != deconv_info.stride().first ||
-      weights->dimension(idx_h) != deconv_info.stride().second || invalid_right != 0 ||
-      invalid_bottom != 0)
-  {
-    return DeconvolutionMethod::DIRECT;
-  }
-
-  return DeconvolutionMethod::GEMM;
-}
-
-void CLTransposeConvLayer::run()
-{
-  prepare();
-  _function->run();
-}
-
-void CLTransposeConvLayer::prepare() { _function->prepare(); }
diff --git a/compute/ARMComputeEx/src/runtime/NEON/NEFunctionsEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/NEFunctionsEx.cpp
deleted file mode 100644
index 80fbf359d..000000000
--- a/compute/ARMComputeEx/src/runtime/NEON/NEFunctionsEx.cpp
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "arm_compute/runtime/NEON/NEFunctionsEx.h"
-
-// NOTE This empty file aims to validate "NEFunctionsEx.h".
-//      DO NOT REMOVE this file.
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp
deleted file mode 100644
index 2752eb6aa..000000000
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/NEActivationLayerEx.h"
-
-#include "arm_compute/core/NEON/kernels/NEActivationLayerKernelEx.h"
-#include "arm_compute/runtime/IRuntimeContext.h"
-#include "support/MemorySupport.h"
-
-namespace arm_compute
-{
-NEActivationLayerEx::NEActivationLayerEx(IRuntimeContext *ctx) // NOLINT
-    : INESimpleFunctionNoBorder(ctx)
-{
-}
-void NEActivationLayerEx::configure(ITensor *input, ITensor *output,
-                                    ActivationLayerInfo activation_info)
-{
-  auto k = support::cpp14::make_unique<NEActivationLayerKernelEx>();
-  k->configure(input, output, activation_info);
-  _kernel = std::move(k);
-}
-
-Status NEActivationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                     const ActivationLayerInfo &act_info)
-{
-  return NEActivationLayerKernelEx::validate(input, output, act_info);
-}
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp
deleted file mode 100644
index 2fc94b267..000000000
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h"
-#include <arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h>
-
-#include "arm_compute/core/ITensor.h"
-#include "support/MemorySupport.h"
-
-#include <utility>
-
-namespace arm_compute
-{
-
-template <BinaryLogicalOperation COP>
-void NEBinaryLogicalOperationStatic<COP>::configure(ITensor *input1, ITensor *input2,
-                                                    ITensor *output)
-{
-  auto k = support::cpp14::make_unique<NEBinaryLogicalOperationKernel>();
-  k->configure(COP, input1, input2, output);
-  _kernel = std::move(k);
-}
-
-template <BinaryLogicalOperation COP>
-Status NEBinaryLogicalOperationStatic<COP>::validate(const ITensorInfo *input1,
-                                                     const ITensorInfo *input2,
-                                                     const ITensorInfo *output)
-{
-  return NEBinaryLogicalOperationKernel::validate(COP, input1, input2, output);
-}
-
-void NEBinaryLogicalOperation::configure(ITensor *input1, ITensor *input2, ITensor *output,
-                                         BinaryLogicalOperation op)
-{
-  auto k = support::cpp14::make_unique<NEBinaryLogicalOperationKernel>();
-  k->configure(op, input1, input2, output);
-  _kernel = std::move(k);
-}
-
-Status NEBinaryLogicalOperation::validate(const ITensorInfo *input1, const ITensorInfo *input2,
-                                          const ITensorInfo *output, BinaryLogicalOperation op)
-{
-  return NEBinaryLogicalOperationKernel::validate(op, input1, input2, output);
-}
-
-// Supported Specializations
-template class NEBinaryLogicalOperationStatic<BinaryLogicalOperation::AND>;
-template class NEBinaryLogicalOperationStatic<BinaryLogicalOperation::OR>;
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp
deleted file mode 100644
index 6ad3e1b12..000000000
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NECastBool.h"
-
-#include "arm_compute/core/NEON/kernels/NECastBoolKernel.h"
-#include "support/MemorySupport.h"
-
-using namespace arm_compute;
-
-void NECastBool::configure(const ITensor *input, ITensor *output)
-{
-  auto k = arm_compute::support::cpp14::make_unique<NECastBoolKernel>();
-  k->configure(input, output);
-  _kernel = std::move(k);
-}
-
-Status NECastBool::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-  return NECastBoolKernel::validate(input, output);
-}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp
deleted file mode 100644
index e0ab3e025..000000000
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h"
-
-#include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h"
-#include "support/MemorySupport.h"
-
-using namespace arm_compute;
-
-void NEEmbeddingLookup::configure(const ITensor *input, ITensor *output, const ITensor *lookups)
-{
-  auto k = support::cpp14::make_unique<NEEmbeddingLookupKernel>();
-  k->configure(input, output, lookups);
-  _kernel = std::move(k);
-}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp
deleted file mode 100644
index a123439d9..000000000
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp
+++ /dev/null
@@ -1,300 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Size2D.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
-#include <algorithm>
-#include <cmath>
-
-using namespace arm_compute;
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output)
-{
-  ARM_COMPUTE_RETURN_ON_ERROR(
-      NEGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output));
-
-  return Status{};
-}
-} // namespace
-
-void NEFullyConnectedHybridLayerReshapeWeights::configure(const ITensor *input, ITensor *output)
-{
-  auto k = support::cpp14::make_unique<NETransposeKernel>();
-  k->configure(input, output);
-  _kernel = std::move(k);
-}
-
-Status NEFullyConnectedHybridLayerReshapeWeights::validate(const ITensorInfo *input,
-                                                           const ITensorInfo *output)
-{
-  return NETransposeKernel::validate(input, output);
-}
-
-NEFullyConnectedHybridLayer::NEFullyConnectedHybridLayer(
-    std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _reshape_weights_function(), _quant_input_kernel(),
-      _mm_gemmlowp(), _accumulate_biases_kernel(), _reshape_weights_output(), _quantized_input(),
-      _scale_factor(), _original_weights(nullptr), _are_weights_reshaped(false),
-      _accumulate_biases(false), _is_prepared(false)
-{
-}
-
-void NEFullyConnectedHybridLayer::configure_mm(const ITensor *input, const ITensor *weights,
-                                               ITensor *output)
-{
-  ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
-
-  // Configure gemmlowp function
-  _mm_gemmlowp.configure(input, weights, nullptr, output);
-}
-
-void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor *weights,
-                                            const ITensor *biases, ITensor *output,
-                                            FullyConnectedLayerInfo fc_info)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-
-  // Perform validate step
-  ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedHybridLayer::validate(
-      input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
-      fc_info));
-
-  _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
-  _accumulate_biases = false;
-  _original_weights = weights;
-
-  // Configure accumulate biases kernel for non quantized asymmetric types
-  if (biases != nullptr)
-  {
-    _accumulate_biases = true;
-
-    // Configure accumulate biases kernel
-    _accumulate_biases_kernel.configure(output, biases);
-  }
-
-  // With the Fully Connected layer we can have 4 different cases:
-  //  1) Convolution layer -> Fully Connected layer without batches
-  //  2) Fully Connected layer -> Fully Connected layer without batches
-  //  3) Convolution layer -> Fully Connected layer with batches
-  //  4) Fully Connected layer -> Fully Connected layer with batches
-
-  const ITensor *weights_to_use = weights;
-
-  // Check if we have a fully connected layer with batches
-  const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
-  bool _is_fc_after_conv;
-  if (is_batched_fc_layer)
-  {
-    _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
-                        (std::equal(input->info()->tensor_shape().cbegin() + 3,
-                                    input->info()->tensor_shape().cend(),
-                                    output->info()->tensor_shape().cbegin() + 1));
-  }
-  else
-  {
-    _is_fc_after_conv = input->info()->num_dimensions() > 1 && input->info()->dimension(1) > 1;
-  }
-  ARM_COMPUTE_ERROR_ON_MSG(_is_fc_after_conv,
-                           "NEFullyConnectedHybridLayer does not support after conv");
-  (void)_is_fc_after_conv;
-
-  // Reshape weights if needed
-  if (!_are_weights_reshaped)
-  {
-    // Reshape the weights
-    _reshape_weights_output.allocator()->init(
-        weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
-            compute_transposed_shape(*weights->info())));
-    _reshape_weights_function.configure(weights_to_use, &_reshape_weights_output);
-    weights_to_use = &_reshape_weights_output;
-  }
-
-  // Quantize input
-  _quantized_input.allocator()->init(
-      input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(
-          DataType::QASYMM8_SIGNED));
-  _scale_factor.allocator()->init(
-      TensorInfo(TensorShape{output->info()->dimension(1)}, 1, DataType::F32));
-  _quant_input_kernel.configure(input, &_quantized_input, &_scale_factor);
-
-  // GEMM
-  _gemmlowp_output.allocator()->init(
-      output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
-  configure_mm(&_quantized_input, weights_to_use, &_gemmlowp_output);
-
-  // Multiply scale
-  _multiply_scale_kernel.configure(&_gemmlowp_output, &_scale_factor, output,
-                                   weights->info()->quantization_info().uniform().scale);
-
-  _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights;
-
-  _quantized_input.allocator()->allocate();
-  _scale_factor.allocator()->allocate();
-  _gemmlowp_output.allocator()->allocate();
-}
-
-Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITensorInfo *weights,
-                                             const ITensorInfo *biases, const ITensorInfo *output,
-                                             FullyConnectedLayerInfo fc_info)
-{
-  ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights);
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8_SIGNED);
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-  ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
-  ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2);
-
-  bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
-
-  const ITensorInfo &reshaped_weights =
-      TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
-          compute_transposed_shape(*weights)));
-
-  // Configure accumulate biases kernel for non quantized asymmetric types
-  if (biases != nullptr)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAccumulateBiasesKernel::validate(output, biases));
-  }
-
-  // With the Fully Connected layer we can have 4 different cases:
-  //  1) Convolution layer -> Fully Connected layer without batches
-  //  2) Fully Connected layer -> Fully Connected layer without batches
-  //  3) Convolution layer -> Fully Connected layer with batches
-  //  4) Fully Connected layer -> Fully Connected layer with batches
-
-  const ITensorInfo *weights_to_use = weights;
-
-  if (!weights_reshaped)
-  {
-    // Validate reshape weights kernel
-    ARM_COMPUTE_RETURN_ON_ERROR(
-        NEFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights));
-    weights_to_use = &reshaped_weights;
-  }
-
-  // Fully Connected layer after a Fully Connected Layer without batches
-  ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
-
-  // Validate quantization kernel
-  const ITensorInfo &quantized_input =
-      TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_data_type(
-          DataType::QASYMM8_SIGNED));
-  const ITensorInfo &scale_factor = TensorInfo(TensorShape{output->dimension(1)}, 1, DataType::F32);
-  ARM_COMPUTE_RETURN_ON_ERROR(
-      NEQuantizationSymmetricKernel::validate(input, &quantized_input, &scale_factor));
-
-  const ITensorInfo &gemmlowp_output = TensorInfo(
-      output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
-  // Validate matrix multiply kernel
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(quantized_input, *weights_to_use, gemmlowp_output));
-
-  ARM_COMPUTE_RETURN_ON_ERROR(NEMultiplyScaleFactorKernel::validate(
-      &gemmlowp_output, &scale_factor, output, weights->quantization_info().uniform().scale));
-
-  return Status{};
-}
-
-void NEFullyConnectedHybridLayer::run()
-{
-  prepare();
-
-  MemoryGroupResourceScope scope_mg(_memory_group);
-
-  // Quantize input
-  NEScheduler::get().schedule(&_quant_input_kernel, Window::DimY);
-
-  // Run matrix multiply
-  _mm_gemmlowp.run();
-
-  // Multiply scale factor
-  NEScheduler::get().schedule(&_multiply_scale_kernel, Window::DimY);
-
-  // Accumulate biases if provided
-  if (_accumulate_biases)
-  {
-    NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY);
-  }
-}
-
-void NEFullyConnectedHybridLayer::prepare()
-{
-  if (!_is_prepared)
-  {
-    ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-    auto release_unused = [](Tensor *w) {
-      if (!w->is_used())
-      {
-        w->allocator()->free();
-      }
-    };
-
-    // Reshape of the weights (happens only once)
-    if (!_are_weights_reshaped)
-    {
-      // Run reshape weights kernel and mark weights as unused
-      _reshape_weights_output.allocator()->allocate();
-      _reshape_weights_function.run();
-
-      _are_weights_reshaped = true;
-      // We can not release _original_weights because it can be used in other nodes
-    }
-
-    // Prepare GEMM prepare and release unused weights
-    _mm_gemmlowp.prepare();
-
-    // Release reshaped weights if unused
-    release_unused(&_reshape_weights_output);
-
-    _is_prepared = true;
-  }
-}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp
deleted file mode 100644
index cb7557a5a..000000000
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp
+++ /dev/null
@@ -1,494 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Size2D.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
-#include <algorithm>
-#include <cmath>
-
-using namespace arm_compute;
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output)
-{
-  if (is_data_type_quantized_asymmetric(input.data_type()))
-  {
-    // Since we need negative offsets for computing convolution, we need to change
-    // QuantizationInfo()
-    // Extract and negate input and weights offset
-    const QuantizationInfo input_quantization_info(input.quantization_info().uniform().scale,
-                                                   -input.quantization_info().uniform().offset);
-    const QuantizationInfo weights_quantization_info(weights.quantization_info().uniform().scale,
-                                                     -weights.quantization_info().uniform().offset);
-
-    // Validate gemmlowp function
-    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(
-        &input.clone()->set_quantization_info(input_quantization_info),
-        &weights.clone()->set_quantization_info(weights_quantization_info), nullptr, &output));
-  }
-  else
-  {
-    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(
-        &input, &weights, nullptr, &output, 1.f, 0.0f,
-        GEMMInfo(false, false, false /* Reshape weights only for the first run */)));
-  }
-
-  return Status{};
-}
-} // namespace
-
-NEFullyConnectedLayerEx::NEFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _flatten_kernel(), _convert_weights(),
-      _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _gemmlowp_output_stage(),
-      _accumulate_biases_kernel(), _flatten_output(), _gemmlowp_output(),
-      _converted_weights_output(), _reshape_weights_output(), _original_weights(nullptr),
-      _are_weights_converted(true), _are_weights_reshaped(false), _is_fc_after_conv(false),
-      _accumulate_biases(false), _is_quantized(false), _is_prepared(false)
-{
-}
-
-void NEFullyConnectedLayerEx::configure_mm(const ITensor *input, const ITensor *weights,
-                                           ITensor *output)
-{
-  if (_is_quantized)
-  {
-    // Since we need negative offsets for computing convolution, we need to change
-    // QuantizationInfo()
-    // Extract and negate input and weights offset
-    const QuantizationInfo input_quantization_info = input->info()->quantization_info();
-    const QuantizationInfo weights_quantization_info = weights->info()->quantization_info();
-
-    input->info()->set_quantization_info(QuantizationInfo(
-        input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
-    weights->info()->set_quantization_info(QuantizationInfo(
-        weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
-
-    // Configure gemmlowp function
-    _mm_gemmlowp.configure(input, weights, nullptr, output);
-
-    // Revert back QuantizatioInfo as input and weights could be used in other fully connected
-    // layers
-    input->info()->set_quantization_info(input_quantization_info);
-    weights->info()->set_quantization_info(weights_quantization_info);
-  }
-  else
-  {
-    // Configure matrix multiply kernel
-    _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f,
-                       GEMMInfo(false, false, false /* Reshape weights only for the first run */));
-  }
-}
-
-void NEFullyConnectedLayerEx::configure_conv_fc(const ITensor *input, const ITensor *weights,
-                                                ITensor *output)
-{
-  ARM_COMPUTE_ERROR_ON(
-      (weights->info()->dimension(1) !=
-       (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
-
-  // If the fully connected layer is called after a convolution layer, the input tensor must be
-  // linearized
-
-  // Initialize output tensor for flatten
-  TensorShape shape_flatten = compute_flatten_shape(input->info());
-  _flatten_output.allocator()->init(
-      input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
-          shape_flatten));
-
-  // Configure flatten kernel
-  _memory_group.manage(&_flatten_output);
-  _flatten_kernel.configure(input, &_flatten_output);
-
-  // Configure matrix multiply kernel
-  configure_mm(&_flatten_output, weights, output);
-
-  // Allocate the output tensor for flatten once all the configure methods have been called
-  _flatten_output.allocator()->allocate();
-}
-
-void NEFullyConnectedLayerEx::configure_fc_fc(const ITensor *input, const ITensor *weights,
-                                              ITensor *output)
-{
-  ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
-
-  // Configure matrix multiply kernel
-  configure_mm(input, weights, output);
-}
-
-void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *weights,
-                                        const ITensor *biases, ITensor *output,
-                                        FullyConnectedLayerInfo fc_info)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-
-  // Perform validate step
-  ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayerEx::validate(
-      input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
-      fc_info));
-
-  _are_weights_converted = true;
-  _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
-  _is_fc_after_conv = true;
-  _accumulate_biases = false;
-  _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
-  _original_weights = weights;
-
-  // Configure gemmlowp output
-  if (_is_quantized)
-  {
-    _gemmlowp_output.allocator()->init(
-        output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(
-            DataType::S32));
-  }
-
-  // Configure accumulate biases kernel for non quantized asymmetric types
-  if (biases != nullptr && !_is_quantized)
-  {
-    _accumulate_biases = true;
-
-    // Configure accumulate biases kernel
-    _accumulate_biases_kernel.configure(output, biases);
-  }
-
-  // With the Fully Connected layer we can have 4 different cases:
-  //  1) Convolution layer -> Fully Connected layer without batches
-  //  2) Fully Connected layer -> Fully Connected layer without batches
-  //  3) Convolution layer -> Fully Connected layer with batches
-  //  4) Fully Connected layer -> Fully Connected layer with batches
-
-  const ITensor *weights_to_use = weights;
-
-  // Check if we have a fully connected layer with batches
-  const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
-  if (is_batched_fc_layer)
-  {
-    _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
-                        (std::equal(input->info()->tensor_shape().cbegin() + 3,
-                                    input->info()->tensor_shape().cend(),
-                                    output->info()->tensor_shape().cbegin() + 1));
-  }
-  else
-  {
-    _is_fc_after_conv = input->info()->num_dimensions() > 1;
-  }
-
-  // Reshape weights if needed
-  if (!_are_weights_reshaped)
-  {
-    // Reshape the weights
-    _reshape_weights_function.configure(weights, &_reshape_weights_output);
-    weights_to_use = &_reshape_weights_output;
-  }
-
-  // Convert weights if needed
-  if (_is_fc_after_conv && (input->info()->data_layout() != fc_info.weights_trained_layout))
-  {
-    // Convert weights
-    _convert_weights.configure(weights_to_use, &_converted_weights_output,
-                               input->info()->tensor_shape(), fc_info.weights_trained_layout);
-
-    weights_to_use = &_converted_weights_output;
-    _are_weights_converted = false;
-  }
-
-  ITensor *tmp_output = (_is_quantized) ? &_gemmlowp_output : output;
-  if (_is_fc_after_conv)
-  {
-    // Fully Connected layer after a Convolution Layer without batches
-    configure_conv_fc(input, weights_to_use, tmp_output);
-  }
-  else
-  {
-    // Fully Connected layer after a Fully Connected Layer without batches
-    configure_fc_fc(input, weights_to_use, tmp_output);
-  }
-
-  // Configure output stage for asymmetric quantized types
-  if (_is_quantized)
-  {
-    float multiplier = input->info()->quantization_info().uniform().scale *
-                       weights->info()->quantization_info().uniform().scale /
-                       output->info()->quantization_info().uniform().scale;
-    int output_multiplier;
-    int output_shift;
-    quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier,
-                                                               &output_shift);
-    _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier,
-                                     output_shift,
-                                     output->info()->quantization_info().uniform().offset);
-    _gemmlowp_output.allocator()->allocate();
-  }
-
-  _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights;
-}
-
-Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights,
-                                         const ITensorInfo *biases, const ITensorInfo *output,
-                                         FullyConnectedLayerInfo fc_info)
-{
-  ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights);
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
-                                                       DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
-  ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
-
-  bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
-  bool is_fc_after_conv = true;
-  bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
-
-  const ITensorInfo &flatten_input =
-      TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
-          compute_flatten_shape(input)));
-  const ITensorInfo &reshaped_weights =
-      TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
-          compute_transposed_shape(*weights)));
-  const ITensorInfo &converted_weights =
-      weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding())
-                       : TensorInfo(*reshaped_weights.clone());
-  const ITensorInfo &gemmlowp_output = TensorInfo(
-      output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
-
-  // Configure accumulate biases kernel for non quantized asymmetric types
-  if (biases != nullptr && !is_quantized)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAccumulateBiasesKernel::validate(output, biases));
-  }
-
-  // With the Fully Connected layer we can have 4 different cases:
-  //  1) Convolution layer -> Fully Connected layer without batches
-  //  2) Fully Connected layer -> Fully Connected layer without batches
-  //  3) Convolution layer -> Fully Connected layer with batches
-  //  4) Fully Connected layer -> Fully Connected layer with batches
-
-  const ITensorInfo *input_to_use = input;
-  const ITensorInfo *weights_to_use = weights;
-  const ITensorInfo *tmp_output = (is_quantized) ? &gemmlowp_output : output;
-
-  // Check if we have a fully connected layer with batches
-  const bool is_batched_fc_layer = output->dimension(1) > 1;
-
-  if (is_batched_fc_layer)
-  {
-    is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
-                       (std::equal(input->tensor_shape().cbegin() + 3, input->tensor_shape().cend(),
-                                   output->tensor_shape().cbegin() + 1));
-  }
-  else
-  {
-    is_fc_after_conv = input->num_dimensions() > 1;
-  }
-
-  if (!weights_reshaped)
-  {
-    // Validate reshape weights kernel
-    ARM_COMPUTE_RETURN_ON_ERROR(
-        NEFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights));
-    weights_to_use = &reshaped_weights;
-  }
-
-  if (is_fc_after_conv && (input->data_layout() != fc_info.weights_trained_layout))
-  {
-    // Validate convert weights kernel
-    ARM_COMPUTE_RETURN_ON_ERROR(NEConvertFullyConnectedWeights::validate(
-        weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout));
-    weights_to_use = &converted_weights;
-  }
-
-  if (is_fc_after_conv)
-  {
-    // Fully Connected layer after a Convolution Layer without batches
-    ARM_COMPUTE_RETURN_ERROR_ON(
-        (weights_to_use->dimension(1) !=
-         (input->dimension(0) * input->dimension(1) * input->dimension(2))));
-
-    // Validate flatten kernel
-    ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayerKernel::validate(input, &flatten_input));
-    input_to_use = &flatten_input;
-  }
-  else
-  {
-    // Fully Connected layer after a Fully Connected Layer without batches
-    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
-  }
-  // Validate matrix multiply kernel
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, *tmp_output));
-
-  // Validate output stage for asymmetric quantized types
-  if (is_quantized)
-  {
-    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(
-        &gemmlowp_output, biases, output));
-  }
-
-  return Status{};
-}
-
-void NEFullyConnectedLayerEx::run()
-{
-  if (!_is_prepared)
-  {
-    if (!_are_weights_reshaped)
-      _reshape_weights_output.allocator()->allocate();
-    if (!_are_weights_converted)
-      _converted_weights_output.allocator()->allocate();
-    _is_prepared = true;
-  }
-
-  {
-    ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-    // Reshape of the weights
-    if (!_are_weights_reshaped)
-    {
-      _reshape_weights_function.run();
-    }
-
-    // Convert weights if needed
-    if (!_are_weights_converted)
-    {
-      _convert_weights.run();
-    }
-
-    // Prepare GEMM prepare
-    if (!_is_quantized)
-    {
-      _mm_gemm.prepare();
-    }
-  }
-
-  MemoryGroupResourceScope scope_mg(_memory_group);
-
-  // Linearize input if it comes from a convolutional layer
-  if (_is_fc_after_conv)
-  {
-    NEScheduler::get().schedule(&_flatten_kernel, Window::DimY);
-  }
-
-  // Run matrix multiply
-  if (_is_quantized)
-  {
-    _mm_gemmlowp.run();
-  }
-  else
-  {
-    _mm_gemm.run();
-  }
-
-  // Accumulate biases if provided
-  if (_is_quantized)
-  {
-    _gemmlowp_output_stage.run();
-  }
-  else
-  {
-    if (_accumulate_biases)
-    {
-      NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY);
-    }
-  }
-}
-
-void NEFullyConnectedLayerEx::prepare()
-{
-#if 0 // TODO Remove this block
-  if (!_is_prepared)
-  {
-    ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-    auto release_unused = [](Tensor *w) {
-      if (!w->is_used())
-      {
-        w->allocator()->free();
-      }
-    };
-
-    // Pointer to current weights
-    const ITensor *cur_weights = _original_weights;
-
-    // Reshape of the weights (happens only once)
-    if (!_are_weights_reshaped)
-    {
-      // Run reshape weights kernel and mark weights as unused
-      _reshape_weights_output.allocator()->allocate();
-      _reshape_weights_function.run();
-
-      cur_weights->mark_as_unused();
-      cur_weights = &_reshape_weights_output;
-      _are_weights_reshaped = true;
-    }
-
-    // Convert weights if needed (happens only once)
-    if (!_are_weights_converted)
-    {
-      _converted_weights_output.allocator()->allocate();
-      _convert_weights.run();
-
-      cur_weights->mark_as_unused();
-      _are_weights_converted = true;
-    }
-
-    // Release reshaped weights if unused
-    release_unused(&_reshape_weights_output);
-
-    // Prepare GEMM prepare and release unused weights
-    if (!_is_quantized)
-    {
-      _mm_gemm.prepare();
-    }
-
-    // Release converted weights if unused
-    release_unused(&_reshape_weights_output);
-    release_unused(&_converted_weights_output);
-
-    _is_prepared = true;
-  }
-#endif
-}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
deleted file mode 100644
index dc6c78478..000000000
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h"
-
-#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h>
-#include <arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h>
-#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h>
-
-using namespace arm_compute;
-
-void NEFullyConnectedReshapingLayer::configure(const arm_compute::ITensor *input,
-                                               const arm_compute::ITensor *weights,
-                                               const arm_compute::ITensor *biases,
-                                               arm_compute::ITensor *output, bool needs_reshape,
-                                               const arm_compute::TensorShape &reshape,
-                                               KernelType kernel_type)
-{
-  _input = input;
-  _weights = weights;
-  _biases = biases;
-  _output = output;
-  _needs_reshape = needs_reshape;
-
-  const ITensor *input_to_use = input;
-  if (_needs_reshape)
-  {
-    // reshape
-    auto_init_if_empty(*_neon_buffer.info(), _input->info()->clone()->set_tensor_shape(reshape));
-    _neon_reshape.configure(_input, &_neon_buffer);
-    input_to_use = &_neon_buffer;
-  }
-
-  _neon_fc = [&]() {
-    if (kernel_type == KernelType::GENERAL)
-    {
-      auto fc = new arm_compute::NEFullyConnectedLayerEx{_memory_manager};
-      fc->configure(input_to_use, _weights, _biases, _output);
-      return std::unique_ptr<arm_compute::IFunction>(fc);
-    }
-    else
-    {
-      assert(kernel_type == KernelType::PREPROCESSED_WEIGHTS);
-
-      bool is_hybrid = input->info()->data_type() == DataType::F32 &&
-                       (weights->info()->data_type() == DataType::S8 ||
-                        weights->info()->data_type() == DataType::QASYMM8_SIGNED);
-
-      if (is_hybrid)
-      {
-        auto fc = new arm_compute::NEFullyConnectedHybridLayer{_memory_manager};
-        ITensorInfo *weights_info = const_cast<ITensorInfo *>(_weights->info());
-        const auto orgin_weights_data_type = weights_info->data_type();
-        weights_info->set_data_type(DataType::QASYMM8_SIGNED);
-        fc->configure(input_to_use, _weights, _biases, _output);
-        weights_info->set_data_type(orgin_weights_data_type);
-        return std::unique_ptr<arm_compute::IFunction>(fc);
-      }
-      else
-      {
-        auto fc = new arm_compute::NEFullyConnectedLayer{_memory_manager};
-        fc->configure(input_to_use, _weights, _biases, _output);
-        return std::unique_ptr<arm_compute::IFunction>(fc);
-      }
-    }
-  }();
-
-  // NOTE _neon_buffer is inaccessible from outside, and thus it is safe to invoke allocate here.
-  if (_needs_reshape)
-  {
-    _neon_buffer.allocator()->allocate();
-  }
-}
-
-void NEFullyConnectedReshapingLayer::run(void)
-{
-  if (_needs_reshape)
-    _neon_reshape.run();
-
-  _neon_fc->run();
-}
-
-void NEFullyConnectedReshapingLayer::prepare(void) { _neon_fc->prepare(); }
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
deleted file mode 100644
index 433c35d58..000000000
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/NEGatherEx.h"
-
-#include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h"
-#include "support/MemorySupport.h"
-
-#include <utility>
-
-namespace arm_compute
-{
-void NEGatherEx::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis)
-{
-  auto k = support::cpp14::make_unique<NEGatherKernelEx>();
-  k->configure(input, indices, output, axis);
-  _kernel = std::move(k);
-}
-
-Status NEGatherEx::validate(const ITensorInfo *input, const ITensorInfo *indices,
-                            const ITensorInfo *output, int axis)
-{
-  return NEGatherKernelEx::validate(input, indices, output, axis);
-}
-
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
deleted file mode 100644
index 52d58accf..000000000
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/NEHashtableLookup.h"
-
-#include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h"
-#include "support/MemorySupport.h"
-
-using namespace arm_compute;
-
-void NEHashtableLookup::configure(const ITensor *lookups, const ITensor *keys, const ITensor *input,
-                                  ITensor *output, ITensor *hits)
-{
-  auto k = support::cpp14::make_unique<NEHashtableLookupKernel>();
-  k->configure(lookups, keys, input, output, hits);
-  _kernel = std::move(k);
-}
-
-Status NEHashtableLookup::validate(const ITensorInfo *lookups, const ITensorInfo *keys,
-                                   const ITensorInfo *input, const ITensorInfo *output,
-                                   const ITensorInfo *hits)
-{
-  return NEHashtableLookupKernel::validate(lookups, keys, input, output, hits);
-}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp
deleted file mode 100644
index 16d74e62d..000000000
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
-namespace arm_compute
-{
-NEInstanceNormalizationLayerEx::NEInstanceNormalizationLayerEx(
-    std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false),
-      _permute_input(), _permute_output(), _permuted_input(), _permuted_output()
-{
-}
-
-void NEInstanceNormalizationLayerEx::configure(ITensor *input, ITensor *output, ITensor *gamma,
-                                               ITensor *beta, float epsilon)
-{
-  const DataLayout data_layout = input->info()->data_layout();
-
-  // Configure Kernels
-  _is_nchw = data_layout == DataLayout::NCHW;
-
-  if (!_is_nchw)
-  {
-    _memory_group.manage(&_permuted_input);
-    _memory_group.manage(&_permuted_output);
-
-    // Configure the function to transform the input tensor from NHWC -> NCHW
-    _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
-    _permuted_input.info()->set_data_layout(DataLayout::NCHW);
-
-    _normalization_kernel.configure(&_permuted_input, &_permuted_output, gamma, beta, epsilon);
-    _permuted_output.info()->set_data_layout(DataLayout::NCHW);
-
-    _permute_output.configure(&_permuted_output, output != nullptr ? output : input,
-                              PermutationVector(2U, 0U, 1U));
-    _permuted_input.allocator()->allocate();
-    _permuted_output.allocator()->allocate();
-  }
-  else
-  {
-    _normalization_kernel.configure(input, output, gamma, beta, epsilon);
-  }
-}
-
-Status NEInstanceNormalizationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                                const ITensorInfo *gamma, const ITensorInfo *beta,
-                                                float epsilon)
-{
-  return NEInstanceNormalizationLayerKernelEx::validate(
-      &input->clone()->set_data_layout(DataLayout::NCHW),
-      &output->clone()->set_data_layout(DataLayout::NCHW), gamma, beta, epsilon);
-}
-
-void NEInstanceNormalizationLayerEx::run()
-{
-  MemoryGroupResourceScope scope_mg(_memory_group);
-
-  // Permute input
-  if (!_is_nchw)
-  {
-    _permute_input.run();
-  }
-
-  NEScheduler::get().schedule(&_normalization_kernel, Window::DimZ);
-
-  // Permute output
-  if (!_is_nchw)
-  {
-    _permute_output.run();
-  }
-}
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp
deleted file mode 100644
index 275c55024..000000000
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEOneHot.h"
-#include "arm_compute/core/NEON/kernels/NEOneHotKernel.h"
-#include "support/MemorySupport.h"
-#include <utility>
-namespace arm_compute
-{
-void NEOneHot::configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value,
-                         const ITensor *off_value, ITensor *output, int axis)
-{
-  auto k = arm_compute::support::cpp14::make_unique<NEOneHotKernel>();
-  k->configure(indices, depth, on_value, off_value, output, axis);
-  _kernel = std::move(k);
-}
-Status NEOneHot::validate(const ITensorInfo *indices, const ITensorInfo *depth,
-                          const ITensorInfo *on_value, const ITensorInfo *off_value,
-                          const ITensorInfo *output, int axis)
-{
-  return NEOneHotKernel::validate(indices, depth, on_value, off_value, output, axis);
-}
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp
deleted file mode 100644
index aedb537e9..000000000
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/NEReduceOperation.h"
-
-#include "arm_compute/core/CPP/Validate.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/runtime/Tensor.h"
-
-using namespace arm_compute;
-
-NEReduceOperation::NEReduceOperation(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(),
-      _reduction_ops(), _keep_dims()
-{
-}
-
-Status NEReduceOperation::validate(const ITensorInfo *input, const Coordinates &reduction_axis,
-                                   bool keep_dims, const ITensorInfo *output, ReduceOperation op)
-{
-  ARM_COMPUTE_UNUSED(keep_dims);
-  ARM_COMPUTE_UNUSED(op);
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
-  ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
-                                                       DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
-
-  TensorShape out_shape = input->tensor_shape();
-  const unsigned int reduction_ops = reduction_axis.num_dimensions();
-  const int input_dims = input->num_dimensions();
-  Coordinates axis_local = reduction_axis;
-
-  // Convert negative axis
-  for (unsigned int i = 0; i < reduction_ops; ++i)
-  {
-    axis_local[i] = wrap_around(axis_local[i], input_dims);
-  }
-
-  std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
-  for (unsigned int i = 0; i < reduction_ops; ++i)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
-    ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) >
-                                input->num_dimensions() - 1);
-    if (output->total_size() > 0 && keep_dims)
-    {
-      ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
-    }
-    if (keep_dims)
-    {
-      out_shape.set(axis_local[i], 1);
-    }
-    else
-    {
-      out_shape.remove_dimension(axis_local[i] - i);
-    }
-  }
-  const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
-
-  return Status{};
-}
-
-void NEReduceOperation::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims,
-                                  ITensor *output, ReduceOperation op)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input);
-
-  _reduction_ops = reduction_axis.num_dimensions();
-  _reduction_kernels.resize(_reduction_ops);
-  _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
-  _keep_dims = keep_dims;
-
-  Coordinates axis_local = reduction_axis;
-  const int input_dims = input->info()->num_dimensions();
-  const unsigned int reduction_ops = reduction_axis.num_dimensions();
-
-  // Convert negative axis
-  for (unsigned int i = 0; i < reduction_ops; ++i)
-  {
-    axis_local[i] = wrap_around(axis_local[i], input_dims);
-  }
-
-  // Perform reduction for every axis
-  for (unsigned int i = 0; i < _reduction_ops; ++i)
-  {
-    TensorShape out_shape =
-        i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
-    out_shape.set(axis_local[i], 1);
-    auto in = (i == 0) ? input : (&_reduced_outs[i - 1]);
-
-    if (i == _reduction_ops - 1 && keep_dims)
-    {
-      _reduction_kernels[i].configure(in, output, axis_local[i], op);
-    }
-    else
-    {
-      _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(),
-                                                    input->info()->data_type(),
-                                                    input->info()->quantization_info()));
-      _memory_group.manage(&_reduced_outs[i]);
-      _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], op);
-    }
-  }
-
-  // Allocate intermediate tensors
-  for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
-  {
-    _reduced_outs[i].allocator()->allocate();
-  }
-
-  // Configure reshape layer if we want to drop the dimensions
-  if (!keep_dims)
-  {
-    TensorShape out_shape = input->info()->tensor_shape();
-
-    // We have to sort the reduction axis vectors in order for remove_dimension
-    // to work properly
-    std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
-    for (unsigned int i = 0; i < _reduction_ops; ++i)
-    {
-      out_shape.remove_dimension(axis_local[i] - i);
-    }
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
-    _reshape.configure(&_reduced_outs[_reduction_ops - 1], output);
-  }
-}
-
-void NEReduceOperation::run()
-{
-  MemoryGroupResourceScope scope_mg(_memory_group);
-
-  for (unsigned int i = 0; i < _reduction_ops; ++i)
-  {
-    _reduction_kernels[i].run();
-  }
-
-  if (!_keep_dims)
-  {
-    _reshape.run();
-  }
-}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp
deleted file mode 100644
index 26a887912..000000000
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/NEReduceSum.h"
-
-#include "arm_compute/core/CPP/Validate.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
-using namespace arm_compute;
-
-NEReduceSum::NEReduceSum(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(),
-      _reduction_ops(), _keep_dims()
-{
-}
-
-Status NEReduceSum::validate(const ITensorInfo *input, const Coordinates &reduction_axis,
-                             bool keep_dims, const ITensorInfo *output)
-{
-  ARM_COMPUTE_UNUSED(keep_dims);
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
-  ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
-                                                       DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
-
-  TensorShape out_shape = input->tensor_shape();
-  const unsigned int reduction_ops = reduction_axis.num_dimensions();
-  const int input_dims = input->num_dimensions();
-  Coordinates axis_local = reduction_axis;
-
-  // Convert negative axis
-  for (unsigned int i = 0; i < reduction_ops; ++i)
-  {
-    axis_local[i] = wrap_around(axis_local[i], input_dims);
-  }
-
-  std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
-  for (unsigned int i = 0; i < reduction_ops; ++i)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
-    ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) >
-                                input->num_dimensions() - 1);
-    if (output->total_size() > 0 && keep_dims)
-    {
-      ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
-    }
-    if (keep_dims)
-    {
-      out_shape.set(axis_local[i], 1);
-    }
-    else
-    {
-      out_shape.remove_dimension(axis_local[i] - i);
-    }
-  }
-  const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
-
-  return Status{};
-}
-
-void NEReduceSum::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims,
-                            ITensor *output)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input);
-
-  _reduction_ops = reduction_axis.num_dimensions();
-  _reduction_kernels.resize(_reduction_ops);
-  _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
-  _keep_dims = keep_dims;
-
-  Coordinates axis_local = reduction_axis;
-  const int input_dims = input->info()->num_dimensions();
-  const unsigned int reduction_ops = reduction_axis.num_dimensions();
-
-  // Convert negative axis
-  for (unsigned int i = 0; i < reduction_ops; ++i)
-  {
-    axis_local[i] = wrap_around(axis_local[i], input_dims);
-  }
-
-  // Perform reduction for every axis
-  for (unsigned int i = 0; i < _reduction_ops; ++i)
-  {
-    TensorShape out_shape =
-        i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
-    out_shape.set(axis_local[i], 1);
-    auto in = (i == 0) ? input : (&_reduced_outs[i - 1]);
-
-    if (i == _reduction_ops - 1 && keep_dims)
-    {
-      _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::SUM);
-    }
-    else
-    {
-      _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(),
-                                                    input->info()->data_type(),
-                                                    input->info()->quantization_info())
-                                             .set_data_layout(input->info()->data_layout()));
-      _memory_group.manage(&_reduced_outs[i]);
-      _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i],
-                                      ReductionOperation::SUM);
-    }
-  }
-
-  // Allocate intermediate tensors
-  for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
-  {
-    _reduced_outs[i].allocator()->allocate();
-  }
-
-  // Configure reshape layer if we want to drop the dimensions
-  if (!keep_dims)
-  {
-    TensorShape out_shape = input->info()->tensor_shape();
-
-    // We have to sort the reduction axis vectors in order for remove_dimension
-    // to work properly
-    std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
-    for (unsigned int i = 0; i < _reduction_ops; ++i)
-    {
-      out_shape.remove_dimension(axis_local[i] - i);
-    }
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
-    _reshape.configure(&_reduced_outs[_reduction_ops - 1], output);
-  }
-}
-
-void NEReduceSum::run()
-{
-  MemoryGroupResourceScope scope_mg(_memory_group);
-
-  for (unsigned int i = 0; i < _reduction_ops; ++i)
-  {
-    _reduction_kernels[i].run();
-  }
-
-  if (!_keep_dims)
-  {
-    _reshape.run();
-  }
-}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReductionOperationEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReductionOperationEx.cpp
deleted file mode 100644
index 2aa0d2d4b..000000000
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReductionOperationEx.cpp
+++ /dev/null
@@ -1,173 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/NEReductionOperationEx.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
-namespace arm_compute
-{
-namespace
-{
-/** Define dimension to split the window
- *
- * @param[in] axis Reduction axis
- *
- * @return The dimension to split the window
- */
-size_t reduction_window_split_dimension(unsigned int axis)
-{
-  switch (axis)
-  {
-    case 0:
-      return Window::DimY;
-    case 1:
-    case 2:
-    case 3:
-      return Window::DimX;
-    default:
-      ARM_COMPUTE_ERROR("Unsupported reduction axis");
-  }
-}
-} // namespace
-
-NEReductionOperationEx::NEReductionOperationEx()
-    : _reduction_kernel(), _fill_border_kernel(), _window_split(0), _reduction_axis()
-{
-}
-
-Status NEReductionOperationEx::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                        unsigned int axis, ReduceOperation op)
-{
-  ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernelEx::validate(input, output, axis, op));
-
-  return Status{};
-}
-
-void NEReductionOperationEx::configure(ITensor *input, ITensor *output, unsigned int axis,
-                                       ReduceOperation op)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_ERROR_THROW_ON(
-      NEReductionOperationEx::validate(input->info(), output->info(), axis, op));
-
-  // Configure reduction kernel
-  _reduction_kernel.configure(input, output, axis, op);
-  _window_split = reduction_window_split_dimension(axis);
-  _reduction_axis = axis;
-
-  if (axis == 0)
-  {
-    // Configure fill border kernel
-    const BorderSize fill_border_size = _reduction_kernel.border_size();
-    PixelValue pixelValue;
-    switch (op)
-    {
-      case ReduceOperation::MIN:
-      {
-        switch (input->info()->data_type())
-        {
-          case DataType::F32:
-          {
-            pixelValue = PixelValue(std::numeric_limits<float>::max());
-            break;
-          }
-          case DataType::F16:
-          {
-            pixelValue = PixelValue(static_cast<half>(65504.0f));
-            break;
-          }
-          case DataType::QASYMM8:
-          {
-            pixelValue =
-                PixelValue(255, input->info()->data_type(), input->info()->quantization_info());
-            break;
-          }
-          default:
-          {
-            ARM_COMPUTE_ERROR("Unsupported DataType");
-          }
-        }
-        break;
-      }
-      case ReduceOperation::MAX:
-      {
-        switch (input->info()->data_type())
-        {
-          case DataType::F32:
-          {
-            pixelValue = PixelValue(-std::numeric_limits<float>::max());
-            break;
-          }
-          case DataType::F16:
-          {
-            pixelValue = PixelValue(static_cast<half>(-65504.0f));
-            break;
-          }
-          case DataType::QASYMM8:
-          {
-            pixelValue =
-                PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
-            break;
-          }
-          default:
-          {
-            ARM_COMPUTE_ERROR("Unsupported DataType");
-          }
-        }
-        break;
-      }
-      default:
-        ARM_COMPUTE_ERROR("Reduction Operation unsupported");
-    }
-    _fill_border_kernel.configure(input, fill_border_size, BorderMode::CONSTANT, pixelValue);
-  }
-}
-
-void NEReductionOperationEx::run()
-{
-  if (_reduction_axis == 0)
-  {
-    NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
-  }
-  NEScheduler::get().schedule(&_reduction_kernel, _window_split);
-}
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp
deleted file mode 100644
index aa165cc15..000000000
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp
+++ /dev/null
@@ -1,242 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NETransposeConvLayer.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/UtilsEx.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-
-NETransposeConvLayer::NETransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
-    : _memory_group(std::move(memory_manager)),
-      _conv_f(),
-      _upsample_f(),
-      _flip_weights(),
-      _scaled_output(),
-      _weights_flipped(),
-      _flip_axis(),
-      _original_weights(nullptr),
-      _input(nullptr),
-      _info(),
-      _is_prepared(false)
-{
-}
-
-Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights,
-                                      const ITensorInfo *bias, const ITensorInfo *output,
-                                      const PadStrideInfo &info, unsigned int invalid_right,
-                                      unsigned int invalid_bottom)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16,
-                                                       DataType::QASYMM8, DataType::QASYMM8_SIGNED);
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, input);
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input);
-  const unsigned int width_idx =
-      get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
-  const unsigned int height_idx =
-      get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
-  ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx));
-  ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) < 1);
-
-  auto out_dims = transposeconv_output_dimensions(
-      input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx),
-      weights->dimension(height_idx), info, invalid_right, invalid_bottom);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-  if (bias != nullptr)
-  {
-    if (is_data_type_quantized_asymmetric(input->data_type()))
-    {
-      ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
-    }
-    else
-    {
-      ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
-    }
-  }
-
-  if (output->tensor_shape().total_size() > 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-    const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights);
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(),
-                                    "Output's width is invalid.");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(),
-                                    "Output's height is invalid.");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(),
-                                    "Output's depth is invalid.");
-  }
-
-  unsigned int pad_left = 0;
-  unsigned int pad_right = 0;
-  unsigned int pad_top = 0;
-  unsigned int pad_bottom = 0;
-  const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
-      *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top,
-      pad_bottom);
-  TensorInfo scale_out_info(
-      input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape));
-  const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
-
-  const unsigned int batches_idx =
-      get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES);
-  const unsigned int channel_idx =
-      get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL);
-  ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(batches_idx) !=
-                              scale_out_info.dimension(batches_idx));
-  ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) !=
-                              scale_out_info.dimension(channel_idx));
-
-  ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output,
-                                                           conv_info, WeightsInfo()));
-
-  return Status{};
-}
-
-void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias,
-                                     ITensor *output, const PadStrideInfo &info,
-                                     unsigned int invalid_right, unsigned int invalid_bottom)
-{
-  // Perform validation step
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-  ARM_COMPUTE_ERROR_THROW_ON(NETransposeConvLayer::validate(
-      input->info(), weights->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(),
-      info, invalid_right, invalid_bottom));
-
-  const DataLayout data_layout = input->info()->data_layout();
-  const unsigned int width_idx =
-      get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-  const unsigned int height_idx =
-      get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-  auto out_dims = transposeconv_output_dimensions(
-      input->info()->dimension(width_idx), input->info()->dimension(height_idx),
-      weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info,
-      invalid_right, invalid_bottom);
-
-  const TensorShape output_shape =
-      compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info());
-
-  _input = input;
-  _original_weights = weights;
-  _info = info;
-  _is_prepared = false;
-
-  unsigned int pad_left = 0;
-  unsigned int pad_right = 0;
-  unsigned int pad_top = 0;
-  unsigned int pad_bottom = 0;
-  const unsigned int stride_x = info.stride().first;
-  const unsigned int stride_y = info.stride().second;
-
-  // Output auto initialization if not yet initialized
-  auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(),
-                     input->info()->quantization_info());
-
-  _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
-  _memory_group.manage(&_scaled_output);
-
-  _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
-  _flip_weights.configure(weights, &_weights_flipped, &_flip_axis);
-
-  // setup the function to convolve the upscaled output
-  const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
-
-  const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
-      *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left,
-      pad_right, pad_top, pad_bottom);
-
-  const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom,
-                                    DimensionRoundingType::FLOOR);
-
-  TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(),
-                            input->info()->quantization_info());
-  scale_out_info.set_data_layout(data_layout);
-  _scaled_output.allocator()->init(scale_out_info);
-
-  _upsample_f.configure(input, &_scaled_output, upsample_info);
-
-  _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info);
-
-  // Setup flip axis data
-  _flip_axis.allocator()->allocate();
-  auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer());
-  axis_data[0] = static_cast<uint32_t>(width_idx);
-  axis_data[1] = static_cast<uint32_t>(height_idx);
-
-  _scaled_output.allocator()->allocate();
-}
-
-void NETransposeConvLayer::run()
-{
-  prepare();
-
-  MemoryGroupResourceScope scope_mg(_memory_group);
-
-  _upsample_f.run();
-  _conv_f.run();
-}
-
-void NETransposeConvLayer::prepare()
-{
-  if (!_is_prepared)
-  {
-    ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-    // Run weights flipping and mark original weights tensor as unused
-    _weights_flipped.allocator()->allocate();
-    _flip_weights.run();
-    _original_weights->mark_as_unused();
-
-    // Prepare convolution
-    _conv_f.prepare();
-
-    _is_prepared = true;
-  }
-}
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/topk_v2.h b/compute/ARMComputeEx/src/runtime/topk_v2.h
deleted file mode 100644
index f94effea1..000000000
--- a/compute/ARMComputeEx/src/runtime/topk_v2.h
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * @file topk_v2.h
- * @brief This file contains TopK method and TopContainer class for TopK operation
- * @ingroup COM_AI_RUNTIME
- */
-
-#ifndef __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__
-#define __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__
-
-typedef int32_t int32;
-
-namespace nnfw
-{
-namespace rt
-{
-namespace optimized_ops
-{
-/**
- * @brief class to define TopK operation
- * @note The follwing codes are impemented and modified while referring to TFLite topk_v2.cc file.
- * TopK_v2 of NN Runtime supports TENSOR_FLOAT32, TENSOR_QUANT8_ASYMM, TENSOR_INT32 other than
- * TFLite.
- * (TFLite additionaly supports kTfLiteInt64.)
- *
- * The class that collects top indexes of k values. Based on template
- * tensorflow::gtl::TopN<> but, for optimization,
- * it re-uses the same container.
- */
-template <typename T> class TopContainer
-{
-public:
-  /**
-   * @brief Prevent default constructor of of this class
-   */
-  TopContainer() = delete;
-  /**
-   * @brief Constructor with params
-   * @param [in] row_size Size of row in data
-   * @param [in] k The top k predictions
-   */
-  TopContainer(int32 k, int32 row_size) : k_(k), container_(), values_(nullptr)
-  {
-    container_.reserve(std::min(k, row_size) + 1);
-  }
-
-  /**
-   * @brief Prevent instances of this class from being copied (As this class contains pointers)
-   * @param [in] topContainer To copy
-   */
-  TopContainer(const TopContainer &) = delete;
-  /*
-   * @brief Prevent instances of this class from being copied (As this class contains pointers)
-   * @param [in] topContainer To copy
-   * @return Reference of TopContainer
-   */
-  TopContainer &operator=(const TopContainer &) = delete;
-
-  /**
-   * @brief Start collecting
-   * @param [in] values To set as values
-   * @return N/A
-   */
-  void start_collecting(const T *values)
-  {
-    values_ = values;
-    container_.clear();
-  }
-
-  /**
-   * @brief Push a value to be compared for topk
-   * @param [in] a A value to compare
-   * @return N/A
-   */
-  void push(int32 a)
-  {
-    auto comparator = [this](int32 a, int32 b) { return compare_fun(a, b); };
-    if (container_.size() <= (size_t)k_)
-    {
-      container_.push_back(a);
-      if (container_.size() == (size_t)(k_ + 1))
-      {
-        std::make_heap(container_.begin(), container_.end(), comparator);
-        std::pop_heap(container_.begin(), container_.end(), comparator);
-      }
-    }
-    else if (comparator(a, container_.front()))
-    {
-      container_.back() = a;
-      std::push_heap(container_.begin(), container_.end(), comparator);
-      std::pop_heap(container_.begin(), container_.end(), comparator);
-    }
-  }
-
-  /**
-   * @brief Get sorted result from pushed values
-   * @return Reference of vector with sorted values
-   */
-  const std::vector<int32> &sorted_result()
-  {
-    auto comparator = [this](int32 a, int32 b) { return compare_fun(a, b); };
-    if (container_.size() <= (size_t)(k_))
-    {
-      std::sort(container_.begin(), container_.end(), comparator);
-    }
-    else
-    {
-      std::sort_heap(container_.begin(), container_.end() - 1, comparator);
-      container_.resize(k_);
-    }
-    return container_;
-  }
-
-private:
-  int32 k_;
-  std::vector<int32> container_;
-  const T *values_ = nullptr;
-
-  bool compare_fun(int32 a, int32 b) const
-  {
-    if (values_[b] < values_[a])
-    {
-      return true;
-    }
-    else if (values_[b] > values_[a])
-    {
-      return false;
-    }
-    else
-    {
-      return a < b;
-    }
-  }
-};
-
-/**
- * @brief Operates TopK operation with params
- * @param [in] row_size Size of row in data
- * @param [in] num_rows The number of rows in data
- * @param [in] data To be operated in
- * @param [in] k The top k predictions
- * @param [out] output_indexes Indexes of targets in the top k predictions
- * @param [out] output_values Values of targets in the top k predictions
- * @return N/A
- */
-template <typename T>
-void TopK(int32 row_size, int32 num_rows, const T *data, int32 k, int32 *output_indexes,
-          T *output_values)
-{
-  TopContainer<T> topc(k, row_size);
-  for (int row = 0; row < num_rows; ++row)
-  {
-    const T *values_row = data + row * row_size;
-    topc.start_collecting(values_row);
-    for (int32 c = 0; c < row_size; ++c)
-    {
-      topc.push(c);
-    }
-
-    // Prepare output buffers.
-    int32 *indexes_row = output_indexes + row * k;
-    T *output_row = output_values + row * k;
-    // We always assume that the output is sorted.
-    const auto &top_k = topc.sorted_result();
-    std::copy(top_k.begin(), top_k.end(), indexes_row);
-    std::transform(top_k.begin(), top_k.end(), output_row,
-                   [values_row](const int32 loc) { return values_row[loc]; });
-  }
-}
-
-} // namespace optimized_ops
-} // namespace rt
-} // namespace nnfw
-
-#endif // __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__
author	Chunseok Lee <chunseok.lee@samsung.com>	2020-10-29 13:12:50 +0900
committer	Chunseok Lee <chunseok.lee@samsung.com>	2020-10-29 13:12:50 +0900
commit	d6b371e095d737922187a518b8faba1ef6f3a2b1 (patch)
tree	9d90c09c887b5111389dbedf924f59206411cd5a /compute/ARMComputeEx
parent	c55f8a6db48cda9d3a78048338b7f18c4cca62b8 (diff)
download	nnfw-d6b371e095d737922187a518b8faba1ef6f3a2b1.tar.gz nnfw-d6b371e095d737922187a518b8faba1ef6f3a2b1.tar.bz2 nnfw-d6b371e095d737922187a518b8faba1ef6f3a2b1.zip