diff options
Diffstat (limited to 'libs/ARMComputeEx')
142 files changed, 0 insertions, 15042 deletions
diff --git a/libs/ARMComputeEx/CMakeLists.txt b/libs/ARMComputeEx/CMakeLists.txt deleted file mode 100644 index 2483fb55d..000000000 --- a/libs/ARMComputeEx/CMakeLists.txt +++ /dev/null @@ -1,21 +0,0 @@ -if("${TARGET_ARCH}" STREQUAL "x86_64") - return() -endif() - -nnfw_find_package(ARMCompute REQUIRED) - -set(ACL_EX_BASE ${CMAKE_SOURCE_DIR}/libs/ARMComputeEx) - -file(GLOB_RECURSE ACL_EX_SRCS "${ACL_EX_BASE}/*.cpp") - -# generate embeded cl_kernel -execute_process ( - WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}/libs/ARMComputeEx" - COMMAND bash -c "python resolve_includes.py" -) - -add_library(arm_compute_ex SHARED ${ACL_EX_SRCS}) -set_target_properties(arm_compute_ex PROPERTIES COMPILE_FLAGS "-DEMBEDDED_KERNELS=1") -target_include_directories(arm_compute_ex PUBLIC ${CMAKE_SOURCE_DIR}/libs/ARMComputeEx) -target_link_libraries(arm_compute_ex arm_compute_core) -install(TARGETS arm_compute_ex DESTINATION lib) diff --git a/libs/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h b/libs/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h deleted file mode 100644 index e4e752ef9..000000000 --- a/libs/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h +++ /dev/null @@ -1,245 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/** - * @file CLKernelLibraryEx.h - * @ingroup COM_AI_RUNTIME - * @brief This file is a cloned version of CLKernelLibrary.h in ACL. This file defines - * an interface for CLKernelLibrary.cpp which adds more OpenCL kernels on top of ACL. - */ - -#ifndef __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ -#define __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ - -#include "arm_compute/core/CL/OpenCL.h" - -#include <map> -#include <set> -#include <string> -#include <utility> - -namespace arm_compute -{ - -/** - * @brief Class to build OpenCL kernels added from nnfw - * */ -class CLKernelLibraryEx -{ - using StringSet = std::set<std::string>; - -private: - /** - * @brief Construct a new CLKernelLibraryEx object - */ - CLKernelLibraryEx(); - -public: - /** - * @brief Prevent instances of this class from being copied. - */ - CLKernelLibraryEx(const CLKernelLibraryEx &) = delete; - - /** - * @brief Prevent instances of this class from being copied. - */ - const CLKernelLibraryEx &operator=(const CLKernelLibraryEx &) = delete; - - /** - * @brief Get the KernelLibrary singleton. - * @return The KernelLibrary instance - */ - static CLKernelLibraryEx &get(); - - /** - * @brief Initialise the kernel library. - * @param[in] kernel_path Path of the directory from which kernel sources are loaded. - * @param[in] context CL context used to create programs. - * @param[in] device CL device for which the programs are created. - * @return N/A - */ - void init(std::string kernel_path, cl::Context context, cl::Device device) - { - _kernel_path = std::move(kernel_path); - _context = std::move(context); - _device = std::move(device); - } - - /** - * @brief Set the path that the kernels reside in. - * @param[in] kernel_path Path of the directory from which kernel sources are loaded. - * @return N/A - */ - void set_kernel_path(const std::string &kernel_path) { _kernel_path = kernel_path; }; - - /** - * @brief Get the path that the kernels reside in. 
- * @return the path of kernel files - */ - std::string get_kernel_path() { return _kernel_path; }; - - /** - * @brief Get the source of the selected program. - * @param[in] program_name Program name. - * @return Source of the selected program. - */ - std::string get_program_source(const std::string &program_name); - - /** - * @brief Set the CL context used to create programs. - * @note Setting the context also resets the device to the - * first one available in the new context. - * @param[in] context A CL context. - * @return N/A - */ - void set_context(cl::Context context) - { - _context = std::move(context); - if (_context.get() == nullptr) - { - _device = cl::Device(); - } - else - { - const auto cl_devices = _context.getInfo<CL_CONTEXT_DEVICES>(); - - if (cl_devices.empty()) - { - _device = cl::Device(); - } - else - { - _device = cl_devices[0]; - } - } - } - - /** - * @brief Return associated CL context. - * @return A CL context. - */ - cl::Context &context() { return _context; } - - /** - * @brief Set the CL device for which the programs are created. - * @param[in] device A CL device. - * @return N/A - */ - void set_device(cl::Device device) { _device = std::move(device); } - - /** - * @brief Gets the CL device for which the programs are created. - * @return A CL device. - */ - cl::Device &get_device() { return _device; } - - /** - * @brief Return the device version - * @return The content of CL_DEVICE_VERSION - */ - std::string get_device_version(); - - /** - * @brief Create a kernel from the kernel library. - * @param[in] kernel_name Kernel name. - * @param[in] build_options_set Kernel build options as a set. - * @return The created kernel. - */ - Kernel create_kernel(const std::string &kernel_name, - const StringSet &build_options_set = {}) const; - - /** - * @brief Find the maximum number of local work items in a workgroup can be supported for the - * kernel. 
- * @param[in] kernel kernel object - */ - - size_t max_local_workgroup_size(const cl::Kernel &kernel) const; - /** - * @brief Return the default NDRange for the device. - * @return default NDRangeof the device - */ - cl::NDRange default_ndrange() const; - - /** - * @brief Clear the library's cache of binary programs - * @return N/A - */ - void clear_programs_cache() - { - _programs_map.clear(); - _built_programs_map.clear(); - } - - /** - * @brief Access the cache of built OpenCL programs - * @return program map data structure of which key is name of kernel and value is - * kerel source name. (*.cl) - */ - const std::map<std::string, cl::Program> &get_built_programs() const - { - return _built_programs_map; - } - - /** - * @brief Add a new built program to the cache - * @param[in] built_program_name Name of the program - * @param[in] program Built program to add to the cache - * @return N/A - */ - void add_built_program(const std::string &built_program_name, cl::Program program); - - /** - * @brief Returns true if FP16 is supported by the CL device - * @return true if the CL device supports FP16 - */ - bool fp16_supported() const; - - /** - * @brief Returns true if int64_base_atomics extension is supported by the CL device - * @return true if the CL device supports int64_base_atomics extension - */ - bool int64_base_atomics_supported() const; - -private: - /** - * @brief Load program and its dependencies. - * @param[in] program_name Name of the program to load. - */ - const Program &load_program(const std::string &program_name) const; - /** - * @brief Concatenates contents of a set into a single string. - * @param[in] s Input set to concatenate. - * @return Concatenated string. - */ - std::string stringify_set(const StringSet &s) const; - - cl::Context _context; /**< Underlying CL context. */ - cl::Device _device; /**< Underlying CL device. */ - std::string _kernel_path; /**< Path to the kernels folder. 
*/ - mutable std::map<std::string, const Program> - _programs_map; /**< Map with all already loaded program data. */ - mutable std::map<std::string, cl::Program> - _built_programs_map; /**< Map with all already built program data. */ - static const std::map<std::string, std::string> - _kernel_program_map; /**< Map that associates kernel names with programs. */ - static const std::map<std::string, std::string> - _program_source_map; /**< Contains sources for all programs. - Used for compile-time kernel inclusion. >*/ -}; -} -#endif /* __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/OpenCLEx.h b/libs/ARMComputeEx/arm_compute/core/CL/OpenCLEx.h deleted file mode 100644 index dbda354d6..000000000 --- a/libs/ARMComputeEx/arm_compute/core/CL/OpenCLEx.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef __ARM_COMPUTE_OPENCLEX_H__ -#define __ARM_COMPUTE_OPENCLEX_H__ - -#include <string> -#include <utility> - -/* Configure the Khronos C++ wrapper to target OpenCL 1.2: */ -#ifndef ARM_COMPUTE_NO_EXCEPTIONS -#define CL_HPP_ENABLE_EXCEPTIONS -#endif // ARM_COMPUTE_NO_EXCEPTIONS -#define CL_HPP_CL_1_2_DEFAULT_BUILD -#define CL_HPP_TARGET_OPENCL_VERSION 110 -#define CL_HPP_MINIMUM_OPENCL_VERSION 110 -#include <CL/cl2.hpp> - -namespace arm_compute -{ -/** Class for loading OpenCL symbols. */ -class CLSymbolsEx final -{ -private: - CLSymbolsEx() = default; - void load_symbols(void *handle); - -public: - /** Get the static instance of CLSymbols. - * - * @return The static instance of CLSymbols. - */ - static CLSymbolsEx &get(); - /** Load symbols from the given OpenCL library path. - * - * @param[in] library Path to the OpenCL library. - * - * @return True if loading the library is successful. - */ - bool load(const std::string &library); - /** Load symbols from any of the default OpenCL library names. - * - * @return True if loading any library is successful. 
- */ - bool load_default(); - -#define DECLARE_FUNCTION_PTR(func_name) std::function<decltype(func_name)> func_name##_ptr = nullptr - - DECLARE_FUNCTION_PTR(clGetEventInfo); - DECLARE_FUNCTION_PTR(clSetEventCallback); - -#undef DECLARE_FUNCTION_PTR - -private: - std::pair<bool, bool> _loaded{false, false}; -}; -} // namespace arm_compute -#endif /* __ARM_COMPUTE_OPENCLEX_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLActivationLayerExKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLActivationLayerExKernel.h deleted file mode 100644 index 080cc47ef..000000000 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLActivationLayerExKernel.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __ARM_COMPUTE_CLACTIVATIONLAYEREXKERNEL_H__ -#define __ARM_COMPUTE_CLACTIVATIONLAYEREXKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" -#include "arm_compute/core/TypesEx.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Interface for the activation layer kernel. 
*/ -class CLActivationLayerExKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLActivationLayerExKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLActivationLayerExKernel(const CLActivationLayerExKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLActivationLayerExKernel &operator=(const CLActivationLayerExKernel &) = delete; - /** Allow instances of this class to be moved */ - CLActivationLayerExKernel(CLActivationLayerExKernel &&) = default; - /** Allow instances of this class to be moved */ - CLActivationLayerExKernel &operator=(CLActivationLayerExKernel &&) = default; - /** Default destructor */ - ~CLActivationLayerExKernel() = default; - /** Set the input and output tensor. - * - * @note If the output tensor is a nullptr, the activation function will be performed in-place - * - * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will - * store the result - * of the activation function. Data types supported: - * QASYMM8/F16/F32. - * @param[out] output Destination tensor. Data type supported: same as @p input - * @param[in] act_info Activation layer information. - */ - void configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfoEx act_info); - /** Static function to check if given info will lead to a valid configuration of @ref - * CLActivationLayerKernel - * - * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor - * will store the result - * of the activation function. Data types supported: QASYMM8/F16/F32. - * @param[in] output Destination tensor info. Data type supported: same as @p input - * @param[in] act_info Activation layer information. 
- * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const ActivationLayerInfoEx &act_info); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - ICLTensor *_input; - ICLTensor *_output; - bool _run_in_place; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLACTIVATIONLAYEREXKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxKernel.h deleted file mode 100644 index b91a26159..000000000 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxKernel.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * @file CLArgMinMaxKernel.h - * @brief This file defines CLArgMinMaxKernel - * @ingroup COM_AI_RUNTIME - */ - -#ifndef __ARM_COMPUTE_CLARG_MIN_MAX_KERNEL_H__ -#define __ARM_COMPUTE_CLARG_MIN_MAX_KERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" -#include "arm_compute/core/TypesEx.h" - -namespace arm_compute -{ -class ICLTensor; - -/** - * @brief Class to define interface for the argminmax max kernel. - */ -class CLArgMinMaxKernel : public ICLKernel -{ -public: - /** - * @brief Default constructor. 
- */ - CLArgMinMaxKernel(); - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers). - * @param [in] copiedInstance Const reference of CLArgMinMaxKernel to be copied - */ - CLArgMinMaxKernel(const CLArgMinMaxKernel &) = delete; - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers). - * @param [in] copiedInstance Const reference of CLArgMinMaxKernel to be copied - * @return Reference of this instance - */ - CLArgMinMaxKernel &operator=(const CLArgMinMaxKernel &) = delete; - /** - * @brief Allow instances of this class to be moved - * @param [in] movedInstance Rvalue reference of CLArgMinMaxKernel to be moved - */ - CLArgMinMaxKernel(CLArgMinMaxKernel &&) = default; - /** - * @brief Allow instances of this class to be moved - * @param [in] movedInstance Rvalue reference of CLArgMinMaxKernel to be moved - * @return Reference of this instance - */ - CLArgMinMaxKernel &operator=(CLArgMinMaxKernel &&) = default; - /** - * @brief Initialise the kernel's input, output and border mode. - * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32. - * @param[out] output The output tensor, Data types supported: same as @p input. - * @param[in] argminmax_axis Axis to argminmax - * return N/A - */ - void configure(const ICLTensor *input, ICLTensor *output, const uint32_t argminmax_axis, - ArgOperation op); - /** - * @brief Static function to check if given info will lead to a valid configuration of @ref - * CLArgMinMaxKernel - * @param[in] input An input tensor info. Data types supported: U8/QASYMM8/S32/F32. - * @param[in] output The output tensor info, Data types supported: same as @p input1. 
- * @param[in] argminmax_axis Axis to argminmax - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const uint32_t argminmax_axis, ArgOperation op); - - /* - * @brief Run CLArgMinMaxKernel op - * @param[in] window Window to be used for in_slice - * @param[in] queue cl::CommandQueue - * @return N/A - */ - void run(const Window &window, cl::CommandQueue &queue) override; - /* - * @brief Run CLArgMinMaxKernel op on CPU - * @param[in] queue cl::CommandQueue - * @return N/A - */ - void run_on_cpu(cl::CommandQueue &queue); - -private: - const ICLTensor *_input; - ICLTensor *_output; - uint32_t _argminmax_axis; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLargminmaxMAXKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLArithmeticSubtractionExKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLArithmeticSubtractionExKernel.h deleted file mode 100644 index 9a765f310..000000000 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLArithmeticSubtractionExKernel.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#ifndef __ARM_COMPUTE_CLARITHMETICSUBTRACTIONEXKERNEL_H__ -#define __ARM_COMPUTE_CLARITHMETICSUBTRACTIONEXKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Interface for the arithmetic subtraction kernel (support broadcasting) - * - * Arithmetic subtraction is computed by: - * @f[ output(x,y) = input1(x,y) - input2(x,y) @f] - */ -class CLArithmeticSubtractionExKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLArithmeticSubtractionExKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLArithmeticSubtractionExKernel(const CLArithmeticSubtractionExKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLArithmeticSubtractionExKernel &operator=(const CLArithmeticSubtractionExKernel &) = delete; - /** Allow instances of this class to be moved */ - CLArithmeticSubtractionExKernel(CLArithmeticSubtractionExKernel &&) = default; - /** Allow instances of this class to be moved */ - CLArithmeticSubtractionExKernel &operator=(CLArithmeticSubtractionExKernel &&) = default; - /** Default destructor */ - ~CLArithmeticSubtractionExKernel() = default; - - /** Initialise the kernel's inputs, output and convertion policy. - * - * @param[in] input1 First tensor input. Data types supported: U8/S16/F16/F32. - * @param[in] input2 Second tensor input. Data types supported: U8/S16/F16/F32. - * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), - * S16/F16/F32. - * @param[in] policy Policy to use to handle overflow. - */ - void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, - ConvertPolicy policy); - /** Static function to check if given info will lead to a valid configuration of @ref - * CLArithmeticSubtractionExKernel - * - * @param[in] input1 First tensor input info. Data types supported: U8/S16/F16/F32. 
- * @param[in] input2 Second tensor input info. Data types supported: U8/S16/F16/F32. - * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), - * S16/F16/F32. - * @param[in] policy Policy to use to handle overflow. - * - * @return a status - */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *output, ConvertPolicy policy); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - BorderSize border_size() const override; - -private: - const ICLTensor *_input1; /**< Source tensor 1 */ - const ICLTensor *_input2; /**< Source tensor 2 */ - ICLTensor *_output; /**< Destination tensor */ -}; -} // namespace arm_compute -#endif /* __ARM_COMPUTE_CLARITHMETICSUBTRACTIONEXKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h deleted file mode 100644 index 1387897c9..000000000 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#ifndef __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_KERNEL_H__ -#define __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_KERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel to perform BATCH_TO_SPACE_ND operation */ -class CLBatchToSpaceNDKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLBatchToSpaceNDKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLBatchToSpaceNDKernel(const CLBatchToSpaceNDKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLBatchToSpaceNDKernel &operator=(const CLBatchToSpaceNDKernel &) = delete; - /** Allow instances of this class to be moved */ - CLBatchToSpaceNDKernel(CLBatchToSpaceNDKernel &&) = default; - /** Allow instances of this class to be moved */ - CLBatchToSpaceNDKernel &operator=(CLBatchToSpaceNDKernel &&) = default; - /** Default destructor */ - ~CLBatchToSpaceNDKernel() = default; - /** Initialise the kernel's input and output. - * - * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. 
- */ - void configure(const ICLTensor *input, ICLTensor *output, const int32_t *block_size); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; /**< Source tensor */ - ICLTensor *_output; /**< Destination tensor */ -}; - -} // namespace arm_compute -#endif /* __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_KERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h deleted file mode 100644 index ab33d9d3a..000000000 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__ -#define __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" -#include "arm_compute/core/TypesEx.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel to return truth values of two input tensors for Binary Logical Op*/ -class CLBinaryLogicalOpKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLBinaryLogicalOpKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers). 
*/ - CLBinaryLogicalOpKernel(const CLBinaryLogicalOpKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers). */ - CLBinaryLogicalOpKernel &operator=(const CLBinaryLogicalOpKernel &) = delete; - /** Allow instances of this class to be moved */ - CLBinaryLogicalOpKernel(CLBinaryLogicalOpKernel &&) = default; - /** Allow instances of this class to be moved */ - CLBinaryLogicalOpKernel &operator=(CLBinaryLogicalOpKernel &&) = default; - /** Initialize the kernel's input, output. - * - * @param[in] input1 Source tensor1. - * @param[in] input2 Source tensor2. - * @param[out] output Output tensor. - */ - void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, - BinaryLogicalOperation op); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - - BorderSize border_size() const override; - -private: - const ICLTensor *_input1; - const ICLTensor *_input2; - ICLTensor *_output; -}; - -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h deleted file mode 100644 index 4c2feb903..000000000 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * @file CLCastKernel.h - * @ingroup COM_AI_RUNTIME - * @brief This file defines CLCastKernel class - */ - -#ifndef __ARM_COMPUTE_CLCASTKERNEL_H__ -#define __ARM_COMPUTE_CLCASTKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** - * @brief Class to define OpenCL kernel for cast operation - */ -class CLCastKernel : public ICLKernel -{ -public: - /** - * @brief Construct CLCastKernel object - */ - CLCastKernel(); - - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers) - */ - CLCastKernel(const CLCastKernel &) = delete; - - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers) - */ - CLCastKernel &operator=(const CLCastKernel &) = delete; - - /** - * @brief Construct CLCastKernel object using default move constructor - * @param[in] CLCastKernel object to move - */ - CLCastKernel(CLCastKernel &&) = default; - - /** - * @brief Allow instances of this class to be moved - * @param[in] CLCastKernel object to move - */ - CLCastKernel &operator=(CLCastKernel &&) = default; - - /** - * @brief Destruct this CLCastKernel object - */ - ~CLCastKernel() = default; - - /** - * @brief Initialise the kernel's input and output. - * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * @return N/A - */ - void configure(const ICLTensor *input, ICLTensor *output); - - /** - * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command - * queue. - * @note The queue is *not* flushed by this method, and therefore the kernel will not have - * been executed by the time this method returns. - * @param[in] window Region on which to execute the kernel. 
(Must be a valid region of - * the window returned by window()). - * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A - * @return N/A - */ - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; /**< Source tensor */ - ICLTensor *_output; /**< Destination tensor */ -}; -} // namespace arm_compute -#endif /* __ARM_COMPUTE_CLCASTKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLComparisonOpKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLComparisonOpKernel.h deleted file mode 100644 index f5f455993..000000000 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLComparisonOpKernel.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __ARM_COMPUTE_CLCOMPARISON_OP_KERNEL_H__ -#define __ARM_COMPUTE_CLCOMPARISON_OP_KERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" -#include "arm_compute/core/TypesEx.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel to check if values in both tensors are equal*/ -class CLComparisonOpKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLComparisonOpKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers). 
*/ - CLComparisonOpKernel(const CLComparisonOpKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers). */ - CLComparisonOpKernel &operator=(const CLComparisonOpKernel &) = delete; - /** Allow instances of this class to be moved */ - CLComparisonOpKernel(CLComparisonOpKernel &&) = default; - /** Allow instances of this class to be moved */ - CLComparisonOpKernel &operator=(CLComparisonOpKernel &&) = default; - /** Initialize the kernel's input, output. - * - * @param[in] input1 Source tensor1. - * @param[in] input2 Source tensor2. - * @param[out] output Output tensor. - */ - void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, - const ComparisonOperation &op); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - - BorderSize border_size() const override; - -private: - const ICLTensor *_input1; - const ICLTensor *_input2; - ICLTensor *_output; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLCOMPARISON_OP_KERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h deleted file mode 100644 index 60ec7a82a..000000000 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__ -#define __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel to perform depthTospace operation */ -class CLDepthToSpaceKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLDepthToSpaceKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLDepthToSpaceKernel(const CLDepthToSpaceKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLDepthToSpaceKernel &operator=(const CLDepthToSpaceKernel &) = delete; - /** Allow instances of this class to be moved */ - CLDepthToSpaceKernel(CLDepthToSpaceKernel &&) = default; - /** Allow instances of this class to be moved */ - CLDepthToSpaceKernel &operator=(CLDepthToSpaceKernel &&) = default; - /** Default destructor */ - ~CLDepthToSpaceKernel() = default; - /** Initialise the kernel's input and output. - * - * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. 
- */ - void configure(const ICLTensor *input, ICLTensor *output, const int32_t block_size); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; /**< Source tensor */ - ICLTensor *_output; /**< Destination tensor */ -}; - -} // namespace arm_compute -#endif /* __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h deleted file mode 100644 index da075db69..000000000 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/** - * @file CLEmbeddingLookupKernel.h - * @ingroup COM_AI_RUNTIME - * @brief This file defines CLEmbeddingLookupKernel class - */ - -#ifndef __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__ -#define __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** -* @brief Class to perform EmbeddingLookup operation with opencl kernel -*/ -class CLEmbeddingLookupKernel : public ICLKernel -{ -public: - /** - * @brief Construct a CLEmbeddingLookupKernel object - * */ - CLEmbeddingLookupKernel(); - - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers) - * */ - CLEmbeddingLookupKernel(const CLEmbeddingLookupKernel &) = delete; - - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers) - * */ - CLEmbeddingLookupKernel &operator=(const CLEmbeddingLookupKernel &) = delete; - - /** - * @brief Construct a CLEmbeddingLookupKernel object by using default move constructor - * @param[in] CLEmbeddingLookupKernel object to move - * */ - CLEmbeddingLookupKernel(CLEmbeddingLookupKernel &&) = default; - - /** - * @brief Move assignment operator - * @param[in] CLEmbeddingLookupKernel object to move - * */ - CLEmbeddingLookupKernel &operator=(CLEmbeddingLookupKernel &&) = default; - - /** - * @brief Destruct this object - * */ - ~CLEmbeddingLookupKernel() = default; - - /** - * @brief Set the input and output of the kernel - * @param[in] input Source tensor. - * Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 - * @param[out] output Destination tensor. Data type supported: Same as @p input - * @param[in] lookups Lookups are 1D tensor that values are indices into the first - * dimension of input. - * Data types supported: S32. 
- * @return N/A - */ - void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups); - - /** - * @brief Static function to check if given info will lead to a valid configuration of @ref - * CLEmbeddingLookupKernel - * @param[in] input The input tensor info. - * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 - * @param[in] output The output tensor info, Data types supported: same as @p input1. - * @param[in] lookups Lookups info. Data types supported: S32. - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *lookups); - - /** - * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command - * queue. - * @note The queue is *not* flushed by this method, and therefore the kernel will not have - * been executed by the time this method returns. - * @param[in] window Region on which to execute the kernel. (Must be a valid region of - * the window returned by window()). - * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A - * @return N/A - */ - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; /** Source tensor */ - ICLTensor *_output; /** Destination tensor */ - const ICLTensor *_lookups; /** Lookups tensor */ -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLExpKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLExpKernel.h deleted file mode 100644 index a6ea539f8..000000000 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLExpKernel.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __ARM_COMPUTE_CLEXPKERNEL_H__ -#define __ARM_COMPUTE_CLEXPKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel to perform an exponential operation */ -class CLExpKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLExpKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLExpKernel(const CLExpKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLExpKernel &operator=(const CLExpKernel &) = delete; - /** Allow instances of this class to be moved */ - CLExpKernel(CLExpKernel &&) = default; - /** Allow instances of this class to be moved */ - CLExpKernel &operator=(CLExpKernel &&) = default; - /** Default destructor */ - ~CLExpKernel() = default; - /** Set the source, destination of the kernel - * - * @param[in] input Source tensor. Data type supported: F32. - * @param[out] output Destination tensor. Data type supported: F32. 
- */ - void configure(const ICLTensor *input, ICLTensor *output); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; - ICLTensor *_output; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLEXPKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherKernel.h deleted file mode 100644 index 7e35a80b0..000000000 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherKernel.h +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * @file CLGatherKernel.h - * @ingroup COM_AI_RUNTIME - * @brief This file defines CLGatherKernel class - */ - -#ifndef __ARM_COMPUTE_CLGATHERKERNEL_H__ -#define __ARM_COMPUTE_CLGATHERKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** - * @brief Class to define an interface for the gather kernel. - */ -class CLGatherKernel : public ICLKernel -{ -public: - /** - * @brief Construct CLGatherKernel object - * */ - CLGatherKernel(); - - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers). 
- */ - CLGatherKernel(const CLGatherKernel &) = delete; - - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers). - */ - CLGatherKernel &operator=(const CLGatherKernel &) = delete; - - /** - * @brief Construct CLGatherKernel object by using default move constructor - * @param[in] CLGatherKernel object to move - */ - CLGatherKernel(CLGatherKernel &&) = default; - - /** - * @brief Move assignment operator - * @param[in] CLGatherKernel object to move - */ - CLGatherKernel &operator=(CLGatherKernel &&) = default; - - /** - * @brief Initialise the kernel's input, output and border mode. - * @param[in] input1 An input tensor. Data types supported: U8/S32/F32. - * @param[in] input2 An input tensor. Data types supported: S32. - * @param[out] output The output tensor, Data types supported: same as @p input1. - * @return N/A - */ - void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output); - - /** - * @brief Static function to check if given info will lead to a valid configuration of @ref - * CLGatherKernel - * @param[in] input1 An input tensor. Data types supported: U8/S32/F32. - * @param[in] input2 An input tensor. Data types supported: S32. - * @param[out] output The output tensor, Data types supported: same as @p input1. - * @return a status - */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *output); - - /** - * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command - * queue. - * @note The queue is *not* flushed by this method, and therefore the kernel will not have - * been executed by the time this method returns. - * @param[in] window Region on which to execute the kernel. (Must be a valid region of - * the window returned by window()). 
- * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A - * @return N/A - */ - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input1; - const ICLTensor *_input2; - ICLTensor *_output; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLGATHERKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h deleted file mode 100644 index c3fc15637..000000000 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/** - * @file CLHashtableLookupKernel.h - * @ingroup COM_AI_RUNTIME - * @brief This file defines CLHashtableLookupKernel class - */ - -#ifndef __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__ -#define __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" -#include "arm_compute/runtime/CL/CLTensor.h" - -namespace arm_compute -{ -class ICLTensor; - -/** -* @brief Class to perform HashtableLookup operation with opencl kernel -*/ -class CLHashtableLookupKernel : public ICLKernel -{ -public: - /** - * @brief Construct a CLHashtableLookupKernel object - * */ - CLHashtableLookupKernel(); - - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers) - * */ - CLHashtableLookupKernel(const CLHashtableLookupKernel &) = delete; - - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers) - * */ - CLHashtableLookupKernel &operator=(const CLHashtableLookupKernel &) = delete; - - /** - * @brief Construct a CLHashtableLookupKernel object by using default move constructor - * @param[in] CLHashtableLookupKernel object to move - * */ - CLHashtableLookupKernel(CLHashtableLookupKernel &&) = default; - - /** - * @brief Move assignment operator - * @param[in] CLHashtableLookupKernel object to move - * */ - CLHashtableLookupKernel &operator=(CLHashtableLookupKernel &&) = default; - - /** - * @brief Destruct this object - * */ - ~CLHashtableLookupKernel() = default; - - /** - * @brief Set the input and output of the kernel - * @param[in] lookups Lookups 1D tensor that values are indices into the first dimension of - * input. - * @param[in] keys Keys 1D tensor. keys and input pair represent a map. - * Data types supported: S32 - * @param[in] input Source tensor. - * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 - * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p - * input. - * @param[out] hits Hits 1D tensor. 
A boolean tensor that indicates whether the lookup hits - * (True) or not (False). Data types supported: U8/QASYMM8 - * @return N/A - */ - void configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *input, - ICLTensor *output, ICLTensor *hits); - - /** - * @brief Static function to check if given info will lead to a valid configuration of @ref - * CLHashtableLookupKernel - * @param[in] lookups The lookups tensor info. Data types supported: S32. - * @param[in] keys The keys tensor info. keys and input pair represent a map. - * Data types supported: S32 - * @param[in] input The input tensor info. - * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 - * @param[out] output The output tensor. Data types and data layouts supported: Same as @p - * input. - * @param[out] hits The hits tensor info. A boolean tensor that indicates whether the lookup - * hits - * (True) or not (False). Data types supported: U8/QASYMM8 - * @return a status - */ - static Status validate(const ITensorInfo *lookups, const ITensorInfo *keys, - const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *hits); - - /** - * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command - * queue. - * @note The queue is *not* flushed by this method, and therefore the kernel will not have - * been executed by the time this method returns. - * @param[in] window Region on which to execute the kernel. (Must be a valid region of - * the window returned by window()). 
- * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A - * @return N/A - */ - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_lookups; /** Lookups tensor */ - const ICLTensor *_keys; /** Keys tensor */ - const ICLTensor *_input; /** Source tensor */ - ICLTensor *_output; /** Destination tensor */ - ICLTensor *_hits; /** Hits tensor */ - std::unique_ptr<CLTensor> _lookup_indices{nullptr}; /** Lookup indices tensor */ -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h deleted file mode 100644 index ccbea147e..000000000 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __ARM_COMPUTE_CLNEGKERNEL_H__ -#define __ARM_COMPUTE_CLNEGKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel to perform a negation operation on tensor*/ -class CLNegKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLNegKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers). 
*/ - CLNegKernel(const CLNegKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers). */ - CLNegKernel &operator=(const CLNegKernel &) = delete; - /** Allow instances of this class to be moved */ - CLNegKernel(CLNegKernel &&) = default; - /** Allow instances of this class to be moved */ - CLNegKernel &operator=(CLNegKernel &&) = default; - /** Initialize the kernel's input, output. - * - * @param[in] input Source tensor. - * @param[out] output Destination tensor. - */ - void configure(const ICLTensor *input, ICLTensor *output); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; - ICLTensor *_output; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLNEGKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h deleted file mode 100644 index 181a6226a..000000000 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#ifndef __ARM_COMPUTE_CLNORMALIZATIONLAYEREXKERNEL_H__ -#define __ARM_COMPUTE_CLNORMALIZATIONLAYEREXKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Interface for the normalization layer kernel. - */ -class CLNormalizationLayerExKernel : public ICLKernel -{ -public: - /** Constructor */ - CLNormalizationLayerExKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLNormalizationLayerExKernel(const CLNormalizationLayerExKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLNormalizationLayerExKernel &operator=(const CLNormalizationLayerExKernel &) = delete; - /** Default Move Constructor. */ - CLNormalizationLayerExKernel(CLNormalizationLayerExKernel &&) = default; - /** Default move assignment operator */ - CLNormalizationLayerExKernel &operator=(CLNormalizationLayerExKernel &&) = default; - /** Set the input and output tensors. - * - * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions - * [width, height, IFM], - * and an optional 4th dimension for batch of inputs. Data types supported: - * F16/F32. - * @param[out] output Destination tensor. Output will have the same number of dimensions as - * input. Data types supported: same as @p input. - * @param[in] norm_info Normalization layer information like the normalization type, - * normalization size and other parameters. - */ - void configure(const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info); - /** Static function to check if given info will lead to a valid configuration of @ref - * CLNormalizationLayerKernel - * - * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions - * [width, height, IFM], - * and an optional 4th dimension for batch of inputs. Data types supported: - * F16/F32. - * @param[in] output Destination tensor. 
Output will have the same number of dimensions as - * input. Data types supported: same as @p input. - * @param[in] norm_info Normalization layer information like the normalization type, normalization - * size and other parameters. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - NormalizationLayerInfo norm_info); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - BorderSize border_size() const override; - -private: - const ICLTensor *_input; - ICLTensor *_output; - BorderSize _border_size; - bool _is_in_map; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLNORMALIZATIONLAYEREXKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h deleted file mode 100644 index eff1b8bd5..000000000 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#ifndef __ARM_COMPUTE_CLPRELU_KERNEL_H__ -#define __ARM_COMPUTE_CLPRELU_KERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel to calculate PReLU*/ -class CLPReLUKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLPReLUKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers). */ - CLPReLUKernel(const CLPReLUKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers). */ - CLPReLUKernel &operator=(const CLPReLUKernel &) = delete; - /** Allow instances of this class to be moved */ - CLPReLUKernel(CLPReLUKernel &&) = default; - /** Allow instances of this class to be moved */ - CLPReLUKernel &operator=(CLPReLUKernel &&) = default; - /** Initialize the kernel's input, output. - * - * @param[in] input Source tensor1. - * @param[in] alpha Source tensor2. - * @param[out] output Output tensor. - */ - void configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - - BorderSize border_size() const override; - -private: - const ICLTensor *_input; - const ICLTensor *_alpha; - ICLTensor *_output; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLPRELU_KERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernel.h deleted file mode 100644 index cbaa2adee..000000000 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernel.h +++ /dev/null @@ -1,60 +0,0 @@ -/* -* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved -* Copyright (c) 2016-2018 ARM Limited. -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ -#ifndef __ARM_COMPUTE_CLPADLAYERKERNEL_H__ -#define __ARM_COMPUTE_CLPADLAYERKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel to perform PAD operation */ -class CLPadLayerKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLPadLayerKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLPadLayerKernel(const CLPadLayerKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLPadLayerKernel &operator=(const CLPadLayerKernel &) = delete; - /** Allow instances of this class to be moved */ - CLPadLayerKernel(CLPadLayerKernel &&) = default; - /** Allow instances of this class to be moved */ - CLPadLayerKernel &operator=(CLPadLayerKernel &&) = default; - /** Default destructor */ - ~CLPadLayerKernel() = default; - /** Initialise the kernel's input and output. - * - * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * @param[in] pad_size Padding Size tensor. 
Data types supported : S32 - */ - void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *pad_size); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; /**< Source tensor */ - ICLTensor *_output; /**< Destination tensor */ - ICLTensor *_pad_size; /**< Padding Size tensor */ -}; - -} // namespace arm_compute -#endif /* __ARM_COMPUTE_CLPADLAYERKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPermuteExKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPermuteExKernel.h deleted file mode 100644 index 3434deee8..000000000 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPermuteExKernel.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __ARM_COMPUTE_CLPERMUTEEXKERNEL_H__ -#define __ARM_COMPUTE_CLPERMUTEEXKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel to perform tensor permutation. 
- * - * Permutes given a permutation vector - */ -class CLPermuteExKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLPermuteExKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLPermuteExKernel(const CLPermuteExKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLPermuteExKernel &operator=(const CLPermuteExKernel &) = delete; - /** Allow instances of this class to be moved */ - CLPermuteExKernel(CLPermuteExKernel &&) = default; - /** Allow instances of this class to be moved */ - CLPermuteExKernel &operator=(CLPermuteExKernel &&) = default; - /** Set the input and output of the kernel. - * - * @param[in] input The input tensor to permute. Data types supported: - * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] output The output tensor. Data types supported: Same as @p input - * @param[in] perm Permutation vector - */ - void configure(const ICLTensor *input, ICLTensor *output, const PermutationVector &perm); - /** Static function to check if given info will lead to a valid configuration of @ref - * CLPermuteKernel - * - * @param[in] input First tensor input info. Data types supported: - * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. - * @param[in] output Output tensor info. Data types supported: same as @p input. 
- * @param[in] perm Permutation vector - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const PermutationVector &perm); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; - ICLTensor *_output; - PermutationVector _perm; -}; -} // arm_compute -#endif /*__ARM_COMPUTE_CLPERMUTEEXKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h deleted file mode 100644 index d579f5d8f..000000000 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * @file CLPixelWiseDivisionKernel.h - * @ingroup COM_AI_RUNTIME - * @brief This file defines CLPixelWiseDivisionKernel class - */ - -#ifndef __ARM_COMPUTE_CLPIXELWISEDIVISIONKERNEL_H__ -#define __ARM_COMPUTE_CLPIXELWISEDIVISIONKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** - * @brief Interface for the pixelwise division kernel. 
- */ -class CLPixelWiseDivisionKernel : public ICLKernel -{ -public: - /** - * @brief Construct a CLPixelWiseDivisionKernel object - */ - CLPixelWiseDivisionKernel(); - - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers). - */ - CLPixelWiseDivisionKernel(const CLPixelWiseDivisionKernel &) = delete; - - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers). - */ - CLPixelWiseDivisionKernel &operator=(const CLPixelWiseDivisionKernel &) = delete; - - /** - * @brief Construct a CLPixelWiseDivisionKernel object by using move constructor - * @param[in] CLPixelWiseDivisionKernel object to move - */ - CLPixelWiseDivisionKernel(CLPixelWiseDivisionKernel &&) = default; - - /** - * @brief Allow instances of this class to be moved - * @param[in] CLPixelWiseDivisionKernel object to move - */ - CLPixelWiseDivisionKernel &operator=(CLPixelWiseDivisionKernel &&) = default; - - /** - * @brief Initialise the kernel's input, output and border mode. - * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32. - * @param[in] input2 An input tensor. Data types supported: same as @p input1. - * @param[out] output The output tensor, Data types supported: same as @p input1. Note: - * U8 requires both inputs to be U8. - * @param[in] scale Scale to apply after division. - * Scale must be positive and its value must be either 1/255 or 1/2^n - * where n is between 0 and 15. - * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate - * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest - * even. 
- * @return N/A - */ - void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); - - /** - * @brief Static function to check if given info will lead to a valid configuration of @ref - * CLPixelWiseDivisionKernel - * @param[in] input1 An input tensor info. Data types supported: U8/S16/F16/F32. - * @param[in] input2 An input tensor info. Data types supported: same as @p input1. - * @param[in] output The output tensor info, Data types supported: same as @p input1. - * Note: U8 requires both inputs to be U8. - * @param[in] scale Scale to apply after division. - * Scale must be positive and its value must be either 1/255 or 1/2^n - * where n is between 0 and 15. - * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate - * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. - * @return a status - */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, - RoundingPolicy rounding_policy); - - /** - * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command - * queue. - * @note The queue is *not* flushed by this method, and therefore the kernel will not have - * been executed by the time this method returns. - * @param[in] window Region on which to execute the kernel. (Must be a valid region of - * the window returned by window()). - * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A - * @return N/A - */ - void run(const Window &window, cl::CommandQueue &queue) override; - - /** - * @brief The size of the border for that kernel - * @return The width in number of elements of the border. 
- */ - BorderSize border_size() const override; - -private: - const ICLTensor *_input1; - const ICLTensor *_input2; - ICLTensor *_output; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLPIXELWISEDIVISIONKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h deleted file mode 100644 index a26a4a7fc..000000000 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/** - * @file CLReduceOperationKernel.h - * @brief This file defines CLReduceOperationKernel class - * @ingroup COM_AI_RUNTIME - */ - -#ifndef __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__ -#define __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" -#include "arm_compute/core/TypesEx.h" - -namespace arm_compute -{ -class ICLTensor; - -/** - * @brief Class to define interface for the reduce operation kernel - */ -class CLReduceOperationKernel : public ICLKernel -{ -public: - /** - * @brief Default constructor - */ - CLReduceOperationKernel(); - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers) - */ - CLReduceOperationKernel(const CLReduceOperationKernel &) = delete; - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers) - */ - CLReduceOperationKernel &operator=(const CLReduceOperationKernel &) = delete; - /** - * @brief Allow instances of this class to be moved - */ - CLReduceOperationKernel(CLReduceOperationKernel &&) = default; - /** - * @brief Allow instances of this class to be moved - */ - CLReduceOperationKernel &operator=(CLReduceOperationKernel &&) = default; - /** - * @brief Default destructor - */ - ~CLReduceOperationKernel() = default; - - /** - * @brief Set the input and output tensors. - * @param[in] input Source tensor. Data types supported: U8/S32/F32. - * @param[out] output Destination tensor. Data types supported: Same as @p input. - * Output will have the same number of dimensions as input. - * @param[in] axis Axis along which to reduce. - * @param[in] op Reduce operation to perform. - * @return N/A - */ - void configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis, - ReduceOperation op); - - /** - * @brief Static function to check if given info will lead to a valid configuration of @ref - * CLReduceOperationKernel. - * @param[in] input Source tensor info. Data types supported: U8/S32/F32. 
- * @param[in] output Destination tensor info. Data types supported: Same as @p input. - * Output will have the same number of dimensions as input. - * @param[in] axis Axis along which to reduce. - * @param[in] op Reduce operation to perform. - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis, - ReduceOperation op); - - /* - * @brief Run CLReduceOperationKernel op - * @param[in] window Window to be used for in_slice - * @param[in] queue CLQueue - * @return N/A - */ - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; - ICLTensor *_output; - uint32_t _axis; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h deleted file mode 100644 index 68534f1ab..000000000 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#ifndef __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_KERNEL_H__ -#define __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_KERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel to perform SPACE_TO_BATCH_ND operation */ -class CLSpaceToBatchNDKernel final : public ICLKernel -{ -public: - /** Default constructor */ - CLSpaceToBatchNDKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLSpaceToBatchNDKernel(const CLSpaceToBatchNDKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLSpaceToBatchNDKernel &operator=(const CLSpaceToBatchNDKernel &) = delete; - /** Allow instances of this class to be moved */ - CLSpaceToBatchNDKernel(CLSpaceToBatchNDKernel &&) = default; - /** Allow instances of this class to be moved */ - CLSpaceToBatchNDKernel &operator=(CLSpaceToBatchNDKernel &&) = default; - /** Default destructor */ - ~CLSpaceToBatchNDKernel() = default; - /** Initialise the kernel's input and output. - * - * @note The data layout of input and output must be the same. - * @note The number of dimensions of input and output must be 4, and `spatial` dimensions - * are height and width. - * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32. - * Data layout supported: NCHW/NHWC - * @param[in] block_size Block size tensor. Data types supported: S32. - * @param[in] padding_size Padding size tensor. Data types supported: S32. - * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32. 
- * Data layout supported: NCHW/NHWC - */ - void configure(const ICLTensor *input, const ICLTensor *block_size, const ICLTensor *padding_size, - ICLTensor *output); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; /**< Source tensor */ - const ICLTensor *_block_size; /**< Block size tensor */ - const ICLTensor *_padding_size; /**< Padding size tensor */ - ICLTensor *_output; /**< Destination tensor */ -}; - -} // namespace arm_compute - -#endif /* __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_KERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h deleted file mode 100644 index be845a549..000000000 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#ifndef __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__ -#define __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel to perform spaceTodepth operation */ -class CLSpaceToDepthKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLSpaceToDepthKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLSpaceToDepthKernel(const CLSpaceToDepthKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLSpaceToDepthKernel &operator=(const CLSpaceToDepthKernel &) = delete; - /** Allow instances of this class to be moved */ - CLSpaceToDepthKernel(CLSpaceToDepthKernel &&) = default; - /** Allow instances of this class to be moved */ - CLSpaceToDepthKernel &operator=(CLSpaceToDepthKernel &&) = default; - /** Default destructor */ - ~CLSpaceToDepthKernel() = default; - /** Initialise the kernel's input and output. - * - * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - */ - void configure(const ICLTensor *input, ICLTensor *output, const int32_t block_size); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; /**< Source tensor */ - ICLTensor *_output; /**< Destination tensor */ -}; - -} // namespace arm_compute -#endif /* __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSquaredDifferenceKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSquaredDifferenceKernel.h deleted file mode 100644 index a4c44e35d..000000000 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSquaredDifferenceKernel.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __ARM_COMPUTE_CLSQUARED_DIFFERENCE_KERNEL_H__ -#define __ARM_COMPUTE_CLSQUARED_DIFFERENCE_KERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel to return squared difference value of two tensors (x-y)^2*/ -class CLSquaredDifferenceKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLSquaredDifferenceKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers). */ - CLSquaredDifferenceKernel(const CLSquaredDifferenceKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers). */ - CLSquaredDifferenceKernel &operator=(const CLSquaredDifferenceKernel &) = delete; - /** Allow instances of this class to be moved */ - CLSquaredDifferenceKernel(CLSquaredDifferenceKernel &&) = default; - /** Allow instances of this class to be moved */ - CLSquaredDifferenceKernel &operator=(CLSquaredDifferenceKernel &&) = default; - /** Initialize the kernel's input, output. - * - * @param[in] input1 Source tensor1. - * @param[in] input2 Source tensor2. - * @param[out] output Output tensor. 
- */ - void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - - BorderSize border_size() const override; - -private: - const ICLTensor *_input1; - const ICLTensor *_input2; - ICLTensor *_output; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLSQUARED_DIFFERENCE_KERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceExKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceExKernel.h deleted file mode 100644 index 6368c380e..000000000 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceExKernel.h +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/** - * @file CLStridedSliceExKernel.h - * @ingroup COM_AI_RUNTIME - * @brief This file defines CLStridedSliceExKernel class - */ - -#ifndef __ARM_COMPUTE_CLSTRIDEDSLICEEXKERNEL_H__ -#define __ARM_COMPUTE_CLSTRIDEDSLICEEXKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** -* @brief Class to define an interface for the kernel to extract a strided slice of a tensor -*/ -class CLStridedSliceExKernel : public ICLKernel -{ -public: - /** - * @brief Construct a CLStridedSliceExKernel object - * */ - CLStridedSliceExKernel(); - - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers) - * */ - CLStridedSliceExKernel(const CLStridedSliceExKernel &) = delete; - - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers) - * */ - CLStridedSliceExKernel &operator=(const CLStridedSliceExKernel &) = delete; - - /** - * @brief Construct a CLStridedSliceExKernel object by using default move constructor - * @param[in] CLStridedSliceExKernel object to move - * */ - CLStridedSliceExKernel(CLStridedSliceExKernel &&) = default; - - /** - * @brief Move assignment operator - * @param[in] CLStridedSliceExKernel object to move - * */ - CLStridedSliceExKernel &operator=(CLStridedSliceExKernel &&) = default; - - /** - * @brief Destruct this object - * */ - ~CLStridedSliceExKernel() = default; - - /** - * @brief Set the input and output of the kernel - * @param[in] input Source tensor. Data type supported: - * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 - * @param[out] output Destination tensor. Data type supported: Same as @p input - * @param[in] beginData The begin tensor. Data types supported: S32. - * The number of dimensions must be 1. - * The length must be the same as the number of dimensions of input. - * @param[in] endData The end tensor. Data types supported: S32. - * The number of dimensions must be 1. 
- * The length must be the same as the number of dimensions of input. - * @param[in] strideData The stride tensor. Data types supported: S32. - * The number of dimensions must be 1. - * The length must be the same as the number of dimensions of input. - * @param[in] beginMask Mask for begin - * @param[in] endMask Mask for end - * @param[in] shrinkAxisMask Mask for shrink axis. - * @return N/A - */ - void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData, - ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask, - int32_t shrinkAxisMask); - - /** - * @brief Static function to check if given info will lead to a valid configuration of @ref - * CLStridedSliceExKernel - * @param[in] input The input tensor info. Data types supported: - * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 - * @param[in] output The output tensor info, Data types supported: same as @p input1. - * @param[in] begin The begin tensor info. Data types supported: S32. - * The number of dimensions must be 1. - * The length must be the same as the number of dimensions of input. - * @param[in] end The end tensor info. Data types supported: S32. - * The number of dimensions must be 1. - * The length must be the same as the number of dimensions of input. - * @param[in] stride The stride tensor info. Data types supported: S32. - * The number of dimensions must be 1. - * The length must be the same as the number of dimensions of input. - * @param[in] beginMask Mask for begin - * @param[in] endMask Mask for end - * @param[in] shrinkAxisMask Mask for shrink axis. - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *begin, const ITensorInfo *end, - const ITensorInfo *stride, int32_t beginMask, int32_t endMask, - int32_t shrinkAxisMask); - - /** - * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command - * queue. 
- * @note The queue is *not* flushed by this method, and therefore the kernel will not have - * been executed by the time this method returns. - * @param[in] window Region on which to execute the kernel. (Must be a valid region of - * the window returned by window()). - * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A - * @return N/A - */ - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; /** Source tensor */ - ICLTensor *_output; /** Destination tensor */ - ICLTensor *_beginData; /** Start indices of input tensor */ - ICLTensor *_endData; /** Stop indices of input tensor */ - ICLTensor *_stridesData; /** Strides tensor */ - int32_t _beginMask; /** Begin mask */ - int32_t _endMask; /** End mask */ - int32_t _shrinkAxisMask; /** Shrink axis mask */ -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLSTRIDEDSLICEEXKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h deleted file mode 100644 index eb2bad254..000000000 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h +++ /dev/null @@ -1,653 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/** - * @file CLTopKV2Kernel.h - * @brief This file defines classes for TopKV2Kernel - * @ingroup COM_AI_RUNTIME - */ - -#ifndef __ARM_COMPUTE_CLTOPKV2KERNEL_H__ -#define __ARM_COMPUTE_CLTOPKV2KERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" - -// these parameters can be changed -#define _ITEMS 16 // number of items in a group -#define _GROUPS 4 // the number of virtual processors is _ITEMS * _GROUPS -#define _HISTOSPLIT (_ITEMS * _GROUPS / 2) // number of splits of the histogram -#define PERMUT // store the final permutation -//////////////////////////////////////////////////////// - -namespace arm_compute -{ -class ICLTensor; - -/** - * @brief Class to define CLTopKV2Single - */ -class CLTopKV2Single : public ICLKernel -{ -public: - /** - * @brief Constructor - */ - CLTopKV2Single(); - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers). - * @param [in] copiedInstance Const reference of CLTopKV2Single to be copied - */ - CLTopKV2Single(const CLTopKV2Single &) = delete; - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers). 
- * @param [in] copiedInstance Const reference of CLTopKV2Single to be copied - * @return Reference of this instance - */ - CLTopKV2Single &operator=(const CLTopKV2Single &) = delete; - /** - * @brief Allow instances of this class to be moved - * @param [in] movedInstance Rvalue reference of CLTopKV2Single to be moved - */ - CLTopKV2Single(CLTopKV2Single &&) = default; - /** - * @brief Allow instances of this class to be moved - * @param [in] movedInstance Rvalue reference of CLTopKV2Single to be moved - * @return Reference of this instance - */ - CLTopKV2Single &operator=(CLTopKV2Single &&) = default; - - /** - * @brief Initialise kernel with params - * @param[in] input An input tensor - * @param[in] topk_values Values of the top k predictions - * @param[in] topk_indices Indices of the top k predictions - * @param[in] indices Indices - * @param[in] temp_stack Temp stack - * @param[in] k K of the top k predictions - * @param[in] n Number times to quick-sort - * return N/A - */ - void configure(ICLTensor *input, ICLTensor *topk_values, ICLTensor *topk_indices, - cl::Buffer *indices, cl::Buffer *temp_stack, int k, int n); - - /* - * @brief Run CLTopKV2Single op - * @param[in] window Window to be used for in_slice - * @param[in] queue cl::CommandQueue - * @return N/A - */ - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - ICLTensor *_input; - ICLTensor *_topk_values; - ICLTensor *_topk_indices; -}; - -/** - * @brief Class to define CLTopKV2Init - */ -class CLTopKV2Init : public ICLKernel -{ -public: - /** - * @brief Constructor - */ - CLTopKV2Init(); - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers). - * @param [in] copiedInstance Const reference of CLTopKV2Init to be copied - */ - CLTopKV2Init(const CLTopKV2Init &) = delete; - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers). 
- * @param [in] copiedInstance Const reference of CLTopKV2Init to be copied - * @return Reference of this instance - */ - CLTopKV2Init &operator=(const CLTopKV2Init &) = delete; - /** - * @brief Allow instances of this class to be moved - * @param [in] movedInstance Rvalue reference of CLTopKV2Init to be moved - */ - CLTopKV2Init(CLTopKV2Init &&) = default; - /** - * @brief Allow instances of this class to be moved - * @param [in] movedInstance Rvalue reference of CLTopKV2Init to be moved - * @return Reference of this instance - */ - CLTopKV2Init &operator=(CLTopKV2Init &&) = default; - - /** - * @brief Initialise kernel with params - * @param[in] input An input tensor - * @param[in] in_key_buf Buffer of input key - * @param[in] in_ind_buf Buffer of input index - * @param[in] n Number times to quick-sort - * return N/A - */ - void configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffer *in_ind_buf, int n); - - /* - * @brief Run CLTopKV2Init op - * @param[in] window Window to be used for in_slice - * @param[in] queue cl::CommandQueue - * @return N/A - */ - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - ICLTensor *_input; -}; - -/** - * @brief Class to define CLRadixSortHistogram - */ -class CLRadixSortHistogram : public ICLKernel -{ -public: - /** - * @brief Constructor - */ - CLRadixSortHistogram(); - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers). - * @param [in] copiedInstance Const reference of CLRadixSortHistogram to be copied - */ - CLRadixSortHistogram(const CLRadixSortHistogram &) = delete; - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers). 
- * @param [in] copiedInstance Const reference of CLRadixSortHistogram to be copied - * @return Reference of this instance - */ - CLRadixSortHistogram &operator=(const CLRadixSortHistogram &) = delete; - /** - * @brief Allow instances of this class to be moved - * @param [in] movedInstance Rvalue reference of CLRadixSortHistogram to be moved - */ - CLRadixSortHistogram(CLRadixSortHistogram &&) = default; - /** - * @brief Allow instances of this class to be moved - * @param [in] movedInstance Rvalue reference of CLRadixSortHistogram to be moved - * @return Reference of this instance - */ - CLRadixSortHistogram &operator=(CLRadixSortHistogram &&) = default; - - /** - * @brief Initialise kernel with params - * @param[out] hist_buf Buffer of histogram - * @param[in] bits Number of bits to be used for radix sort - * @param[in] n Integer number size to sort - * return N/A - */ - void configure(cl::Buffer *hist_buf, int bits, int n); - - /** - * @brief Set pass - * @param[in] pass Passes made of in radix sort algorithm - * @param[in] in_key_buf Buffer of input key - * return N/A - */ - void setPass(int pass, cl::Buffer *in_key_buf) - { - _pass = pass; - _in_key_buf = in_key_buf; - } - - /* - * @brief Run CLRadixSortHistogram op - * @param[in] window Window to be used for in_slice - * @param[in] queue cl::CommandQueue - * @return N/A - */ - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - int _pass; - cl::Buffer *_in_key_buf; -}; - -/** - * @brief Class to define CLRadixSortScanHistogram - */ -class CLRadixSortScanHistogram : public ICLKernel -{ -public: - /** - * @brief Constructor - */ - CLRadixSortScanHistogram(); - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers). 
- * @param [in] copiedInstance Const reference of CLRadixSortScanHistogram to be copied - */ - CLRadixSortScanHistogram(const CLRadixSortScanHistogram &) = delete; - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers). - * @param [in] copiedInstance Const reference of CLRadixSortScanHistogram to be copied - * @return Reference of this instance - */ - CLRadixSortScanHistogram &operator=(const CLRadixSortScanHistogram &) = delete; - /** - * @brief Allow instances of this class to be moved - * @param [in] movedInstance Rvalue reference of CLRadixSortScanHistogram to be moved - */ - CLRadixSortScanHistogram(CLRadixSortScanHistogram &&) = default; - /** - * @brief Allow instances of this class to be moved - * @param [in] movedInstance Rvalue reference of CLRadixSortScanHistogram to be moved - * @return Reference of this instance - */ - CLRadixSortScanHistogram &operator=(CLRadixSortScanHistogram &&) = default; - - /** - * @brief Initialise kernel with params - * @param[out] hist_buf Buffer of histogram - * @param[out] glob_sum_buf Buffer of global sum - * @param[in] bits Number of bits to be used for radix sort - * return N/A - */ - void configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits); - - /* - * @brief Run CLRadixSortScanHistogram op - * @param[in] window Window to be used for in_slice - * @param[in] queue cl::CommandQueue - * @return N/A - */ - void run(const Window &window, cl::CommandQueue &queue) override; -}; - -/** - * @brief Class to define CLRadixSortGlobalScanHistogram - */ -class CLRadixSortGlobalScanHistogram : public ICLKernel -{ -public: - /** - * @brief Constructor - */ - CLRadixSortGlobalScanHistogram(); - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers). 
- * @param [in] copiedInstance Const reference of CLRadixSortGlobalScanHistogram to be copied - */ - CLRadixSortGlobalScanHistogram(const CLRadixSortGlobalScanHistogram &) = delete; - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers). - * @param [in] copiedInstance Const reference of CLRadixSortGlobalScanHistogram to be copied - * @return Reference of this instance - */ - CLRadixSortGlobalScanHistogram &operator=(const CLRadixSortGlobalScanHistogram &) = delete; - /** - * @brief Allow instances of this class to be moved - * @param [in] movedInstance Rvalue reference of CLRadixSortGlobalScanHistogram to be moved - */ - CLRadixSortGlobalScanHistogram(CLRadixSortGlobalScanHistogram &&) = default; - /** - * @brief Allow instances of this class to be moved - * @param [in] movedInstance Rvalue reference of CLRadixSortGlobalScanHistogram to be moved - * @return Reference of this instance - */ - CLRadixSortGlobalScanHistogram &operator=(CLRadixSortGlobalScanHistogram &&) = default; - - /** - * @brief Initialise kernel with params - * @param[out] glob_sum_buf Buffer of global sum - * @param[out] temp_buf Temp buffer to be used while RadixSortGlobalScanHistogram - * @param[in] bits Number of bits to be used for radix sort - * return N/A - */ - void configure(cl::Buffer *glob_sum_buf, cl::Buffer *temp_buf, int bits); - - /* - * @brief Run CLRadixSortGlobalScanHistogram op - * @param[in] window Window to be used for in_slice - * @param[in] queue cl::CommandQueue - * @return N/A - */ - void run(const Window &window, cl::CommandQueue &queue) override; -}; - -/** - * @brief Class to define CLRadixSortPasteHistogram - */ -class CLRadixSortPasteHistogram : public ICLKernel -{ -public: - /** - * @brief Constructor - */ - CLRadixSortPasteHistogram(); - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers). 
- * @param [in] copiedInstance Const reference of CLRadixSortPasteHistogram to be copied - */ - CLRadixSortPasteHistogram(const CLRadixSortPasteHistogram &) = delete; - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers). - * @param [in] copiedInstance Const reference of CLRadixSortPasteHistogram to be copied - * @return Reference of this instance - */ - CLRadixSortPasteHistogram &operator=(const CLRadixSortPasteHistogram &) = delete; - /** - * @brief Allow instances of this class to be moved - * @param [in] movedInstance Rvalue reference of CLRadixSortPasteHistogram to be moved - */ - CLRadixSortPasteHistogram(CLRadixSortPasteHistogram &&) = default; - /** - * @brief Allow instances of this class to be moved - * @param [in] movedInstance Rvalue reference of CLRadixSortPasteHistogram to be moved - * @return Reference of this instance - */ - CLRadixSortPasteHistogram &operator=(CLRadixSortPasteHistogram &&) = default; - - /** - * @brief Initialise kernel with params - * @param[out] hist_buf Buffer of histogram - * @param[out] glob_sum_buf Buffer of global sum - * @param[in] bits Number of bits to be used for radix sort - * return N/A - */ - void configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits); - - /* - * @brief Run CLRadixSortPasteHistogram op - * @param[in] window Window to be used for in_slice - * @param[in] queue cl::CommandQueue - * @return N/A - */ - void run(const Window &window, cl::CommandQueue &queue) override; -}; - -/** - * @brief Class to define CLRadixSortReorder - */ -class CLRadixSortReorder : public ICLKernel -{ -public: - /** - * @brief Constructor - */ - CLRadixSortReorder(); - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers). 
- * @param [in] copiedInstance Const reference of CLRadixSortReorder to be copied - */ - CLRadixSortReorder(const CLRadixSortReorder &) = delete; - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers). - * @param [in] copiedInstance Const reference of CLRadixSortReorder to be copied - * @return Reference of this instance - */ - CLRadixSortReorder &operator=(const CLRadixSortReorder &) = delete; - /** - * @brief Allow instances of this class to be moved - * @param [in] movedInstance Rvalue reference of CLRadixSortReorder to be moved - */ - CLRadixSortReorder(CLRadixSortReorder &&) = default; - /** - * @brief Allow instances of this class to be moved - * @param [in] movedInstance Rvalue reference of CLRadixSortReorder to be moved - * @return Reference of this instance - */ - CLRadixSortReorder &operator=(CLRadixSortReorder &&) = default; - - /** - * @brief Initialise kernel with params - * @param[out] hist_buf Buffer of histogram - * @param[in] bits Number of bits to be used for radix sort - * @param[in] n Integer number size to sort - * return N/A - */ - void configure(cl::Buffer *hist_buf, int bits, int n); - - /** - * @brief Set pass - * @param[in] pass Passes made of in radix sort algorithm - * @param[in] in_key_buf Buffer of input key - * @param[out] out_key_buf Buffer of output key - * @param[in] in_ind_buf Buffer of input index - * @param[out] out_ind_buf Buffer of output index - * return N/A - */ - void setPass(int pass, cl::Buffer *in_key_buf, cl::Buffer *out_key_buf, cl::Buffer *in_ind_buf, - cl::Buffer *out_ind_buf) - { - _pass = pass; - _in_key_buf = in_key_buf; - _out_key_buf = out_key_buf; - _in_ind_buf = in_ind_buf; - _out_ind_buf = out_ind_buf; - } - /* - * @brief Run CLRadixSortReorder op - * @param[in] window Window to be used for in_slice - * @param[in] queue cl::CommandQueue - * @return N/A - */ - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - int _pass; - cl::Buffer 
*_in_key_buf; - cl::Buffer *_out_key_buf; - cl::Buffer *_in_ind_buf; - cl::Buffer *_out_ind_buf; -}; - -/** - * @brief Class to define CLTopKV2FindFirstNegative - */ -class CLTopKV2FindFirstNegative : public ICLKernel -{ -public: - /** - * @brief Constructor - */ - CLTopKV2FindFirstNegative(); - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers). - * @param [in] copiedInstance Const reference of CLTopKV2FindFirstNegative to be copied - */ - CLTopKV2FindFirstNegative(const CLTopKV2FindFirstNegative &) = delete; - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers). - * @param [in] copiedInstance Const reference of CLTopKV2FindFirstNegative to be copied - * @return Reference of this instance - */ - CLTopKV2FindFirstNegative &operator=(const CLTopKV2FindFirstNegative &) = delete; - /** - * @brief Allow instances of this class to be moved - * @param [in] movedInstance Rvalue reference of CLTopKV2FindFirstNegative to be moved - */ - CLTopKV2FindFirstNegative(CLTopKV2FindFirstNegative &&) = default; - /** - * @brief Allow instances of this class to be moved - * @param [in] movedInstance Rvalue reference of CLTopKV2FindFirstNegative to be moved - * @return Reference of this instance - */ - CLTopKV2FindFirstNegative &operator=(CLTopKV2FindFirstNegative &&) = default; - - /** - * @brief Initialise kernel with params - * @param[out] first_negative_idx_buf Buffer of the first negative index - * @param[in] n Number times to find - * return N/A - */ - void configure(cl::Buffer *first_negative_idx_buf, int n); - - /** - * @brief Set output buffer - * @param[out] out_key_buf Buffer of output key - * return N/A - */ - void setOutputBuffer(cl::Buffer *out_key_buf) { _out_key_buf = out_key_buf; } - - /* - * @brief Run CLTopKV2FindFirstNegative op - * @param[in] window Window to be used for in_slice - * @param[in] queue cl::CommandQueue - * @return N/A - */ - void run(const Window &window, 
cl::CommandQueue &queue) override; - -private: - cl::Buffer *_out_key_buf; -}; - -/** - * @brief Class to define CLTopKV2ReorderNegatives - */ -class CLTopKV2ReorderNegatives : public ICLKernel -{ -public: - /** - * @brief Constructor - */ - CLTopKV2ReorderNegatives(); - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers). - * @param [in] copiedInstance Const reference of CLTopKV2ReorderNegatives to be copied - */ - CLTopKV2ReorderNegatives(const CLTopKV2ReorderNegatives &) = delete; - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers). - * @param [in] copiedInstance Const reference of CLTopKV2ReorderNegatives to be copied - * @return Reference of this instance - */ - CLTopKV2ReorderNegatives &operator=(const CLTopKV2ReorderNegatives &) = delete; - /** - * @brief Allow instances of this class to be moved - * @param [in] movedInstance Rvalue reference of CLTopKV2ReorderNegatives to be moved - */ - CLTopKV2ReorderNegatives(CLTopKV2ReorderNegatives &&) = default; - /** - * @brief Allow instances of this class to be moved - * @param [in] movedInstance Rvalue reference of CLTopKV2ReorderNegatives to be moved - * @return Reference of this instance - */ - CLTopKV2ReorderNegatives &operator=(CLTopKV2ReorderNegatives &&) = default; - - /** - * @brief Initialise kernel with params - * @param[out] first_negative_idx_buf Buffer of the first negative index - * @param[in] n Number times to find - * return N/A - */ - void configure(cl::Buffer *first_negative_idx_buf, int n); - - /** - * @brief Set buffers - * @param[in] in_key_buf Buffer of input key - * @param[out] out_key_buf Buffer of output key - * @param[in] in_ind_buf Buffer of input index - * @param[out] out_ind_buf Buffer of output index - * return N/A - */ - void setBuffers(cl::Buffer *in_key_buf, cl::Buffer *out_key_buf, cl::Buffer *in_ind_buf, - cl::Buffer *out_ind_buf) - { - _in_key_buf = in_key_buf; - _out_key_buf = 
out_key_buf; - _in_ind_buf = in_ind_buf; - _out_ind_buf = out_ind_buf; - } - - /* - * @brief Run CLTopKV2ReorderNegatives op - * @param[in] window Window to be used for in_slice - * @param[in] queue cl::CommandQueue - * @return N/A - */ - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - cl::Buffer *_in_key_buf; - cl::Buffer *_out_key_buf; - cl::Buffer *_in_ind_buf; - cl::Buffer *_out_ind_buf; -}; - -/** - * @brief Class to define CLTopKV2Store - */ -class CLTopKV2Store : public ICLKernel -{ -public: - /** - * @brief Constructor - */ - CLTopKV2Store(); - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers). - * @param [in] copiedInstance Const reference of CLTopKV2Store to be copied - */ - CLTopKV2Store(const CLTopKV2Store &) = delete; - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers). - * @param [in] copiedInstance Const reference of CLTopKV2Store to be copied - * @return Reference of this instance - */ - CLTopKV2Store &operator=(const CLTopKV2Store &) = delete; - /** - * @brief Allow instances of this class to be moved - * @param [in] movedInstance Rvalue reference of CLTopKV2Store to be moved - */ - CLTopKV2Store(CLTopKV2Store &&) = default; - /** - * @brief Allow instances of this class to be moved - * @param [in] movedInstance Rvalue reference of CLTopKV2Store to be moved - * @return Reference of this instance - */ - CLTopKV2Store &operator=(CLTopKV2Store &&) = default; - - /** - * @brief Initialise kernel with params - * @param[out] values Values tensor to store - * @param[out] indices Indices tensor to be used for store - * @param[in] k K of the top k predictions - * @param[in] n Number times to store - * return N/A - */ - void configure(ICLTensor *values, ICLTensor *indices, int k, int n); - - /** - * @brief Set buffers - * @param[out] out_key_buf Buffer of output key - * @param[out] out_ind_buf Buffer of output index - * return N/A 
- */ - void setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf); - - /* - * @brief Run CLTopKV2Store op - * @param[in] window Window to be used for in_slice - * @param[in] queue cl::CommandQueue - * @return N/A - */ - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - ICLTensor *_values; - ICLTensor *_indices; - cl::Buffer *_out_key_buf; - cl::Buffer *_out_ind_buf; -}; - -} // namespace arm_compute - -#endif // __ARM_COMPUTE_CLTOPKV2KERNEL_H__ diff --git a/libs/ARMComputeEx/arm_compute/core/NEON/kernels/NENormalizationLayerExKernel.h b/libs/ARMComputeEx/arm_compute/core/NEON/kernels/NENormalizationLayerExKernel.h deleted file mode 100644 index f7bf72985..000000000 --- a/libs/ARMComputeEx/arm_compute/core/NEON/kernels/NENormalizationLayerExKernel.h +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __ARM_COMPUTE_NENORMALIZATIONLAYEREXKERNEL_H__ -#define __ARM_COMPUTE_NENORMALIZATIONLAYEREXKERNEL_H__ - -#include "arm_compute/core/NEON/INEKernel.h" - -namespace arm_compute -{ -class ITensor; - -/** Interface for the normalization layer kernel. 
- */ -class NENormalizationLayerExKernel : public INEKernel -{ -public: - const char *name() const override { return "NENormalizationLayerKernel"; } - /** Default constructor */ - NENormalizationLayerExKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NENormalizationLayerExKernel(const NENormalizationLayerExKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NENormalizationLayerExKernel &operator=(const NENormalizationLayerExKernel &) = delete; - /** Default Move Constructor. */ - NENormalizationLayerExKernel(NENormalizationLayerExKernel &&) = default; - /** Default move assignment operator */ - NENormalizationLayerExKernel &operator=(NENormalizationLayerExKernel &&) = default; - /** Default destructor */ - ~NENormalizationLayerExKernel() = default; - /** Set the input and output tensors. - * - * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions - * [width, height, IFM], - * and an optional 4th dimension for batch of inputs. Data types - * supported: FP16/F32. - * @param[in] input_squared Source with each element has been squared. 3 lower dims represent a - * single input with dimensions [width, height, IFM], - * Data type supported: same as @p input - * @param[out] output Destination tensor. Output will have the same number of dimensions as - * input. Data type supported: same as @p input - * @param[in] norm_info Normalization layer information like the normalization type, - * normalization size and other parameters. - */ - void configure(const ITensor *input, const ITensor *input_squared, ITensor *output, - NormalizationLayerInfo norm_info); - /** Static function to check if given info will lead to a valid configuration of @ref - * NENormalizationLayerKernel - * - * @param[in] input Source tensor. 
3 lower dims represent a single input with dimensions - * [width, height, IFM], - * and an optional 4th dimension for batch of inputs. Data types - * supported: FP16/F32. - * @param[in] input_squared Source with each element has been squared. 3 lower dims represent a - * single input with dimensions [width, height, IFM], - * Data type supported: same as @p input - * @param[in] output Destination tensor. Output will have the same number of dimensions as - * input. Data type supported: same as @p input - * @param[in] norm_info Normalization layer information like the normalization type, - * normalization size and other parameters. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *input_squared, - const ITensorInfo *output, NormalizationLayerInfo norm_info); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - BorderSize border_size() const override; - -private: - /** Function to perform normalization depending on the given template - * dimension. The second template parameter specifies whether the - * normalization has to be 1D or 2D. - * - * @note Only supported normalizations are: - * - 1D over X or Z - * - 2D over X and Y - * - * @param[in] window Region on which to execute the kernel. - */ - template <DataType dt, unsigned int dim, bool do_2D_norm> - void normalize_float(const Window &window); - - /** Common signature for all the specialised normalization functions - * - * @param[in] window Region on which to execute the kernel. 
- */ - using NormalizationFunctionEx = void (NENormalizationLayerExKernel::*)(const Window &window); - -private: - NormalizationFunctionEx _func; - const ITensor *_input; - const ITensor *_input_squared; - ITensor *_output; - NormalizationLayerInfo _norm_info; - BorderSize _border_size; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_NENORMALIZATIONLAYEREXKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/TypesEx.h b/libs/ARMComputeEx/arm_compute/core/TypesEx.h deleted file mode 100644 index 8381f1cc6..000000000 --- a/libs/ARMComputeEx/arm_compute/core/TypesEx.h +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#ifndef __ARM_COMPUTE_TYPESEX_H__ -#define __ARM_COMPUTE_TYPESEX_H__ - -#include <cmath> -#include <cstddef> -#include <cstdint> -#include <string> -#include <utility> - -namespace arm_compute -{ - -/** Available ArgIndex operations **/ -enum class ArgOperation -{ - MAX, - MIN, -}; - -/** Available reduce operations */ -enum class ReduceOperation -{ - MAX, /**< Max */ - MEAN, /**< Mean */ - SUM, /**< Sum */ - MIN, /**< Min */ -}; - -/** Available binary logical operations */ -enum class BinaryLogicalOperation -{ - AND, /**< AND */ - OR, /**< OR */ -}; - -enum class ComparisonOperation -{ - EQUAL, /**< EQUAL */ - NOT_EQUAL, /**< NOT_EQUAL */ -}; - -/** Activation Layer Information class */ -class ActivationLayerInfoEx -{ -public: - /** Available activation functions */ - enum class ActivationFunction - { - RSQRT /**< Inverse Square root ( \f$ f(x) = \rsqrt{x} \f$ )*/ - }; - - ActivationLayerInfoEx() = default; - /** Default Constructor - * - * @param[in] f The activation function to use. - * @param[in] a (Optional) The alpha parameter used by some activation functions - * (@ref ActivationFunction::BOUNDED_RELU, @ref ActivationFunction::LU_BOUNDED_RELU, - * @ref ActivationFunction::LINEAR, @ref ActivationFunction::TANH). - * @param[in] b (Optional) The beta parameter used by some activation functions (@ref - * ActivationFunction::LINEAR, @ref ActivationFunction::LU_BOUNDED_RELU, @ref - * ActivationFunction::TANH). 
- */ - ActivationLayerInfoEx(ActivationFunction f, float a = 0.0f, float b = 0.0f) - : _act(f), _a(a), _b(b), _enabled(true) - { - } - /** Get the type of activation function */ - ActivationFunction activation() const { return _act; } - /** Get the alpha value */ - float a() const { return _a; } - /** Get the beta value */ - float b() const { return _b; } - /** Check if initialised */ - bool enabled() const { return _enabled; } - -private: - ActivationFunction _act = {ActivationLayerInfoEx::ActivationFunction::RSQRT}; - float _a = {}; - float _b = {}; - bool _enabled = {false}; -}; - -} // namespace arm_compute -#endif /* __ARM_COMPUTE_TYPESEX_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/UtilsEx.h b/libs/ARMComputeEx/arm_compute/core/UtilsEx.h deleted file mode 100644 index 8dd68a0c3..000000000 --- a/libs/ARMComputeEx/arm_compute/core/UtilsEx.h +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __ARM_COMPUTE_UTILSEX_H__ -#define __ARM_COMPUTE_UTILSEX_H__ - -#include "arm_compute/core/TypesEx.h" - -#include <cstdint> -#include <cstdlib> -#include <sstream> -#include <string> - -namespace arm_compute -{ -/** Translates a given activation function to a string. - * - * @param[in] act @ref ActivationLayerInfo::ActivationFunction to be translated to string. 
- * - * @return The string describing the activation function. - */ -const std::string &string_from_activation_func_ex(ActivationLayerInfoEx::ActivationFunction act); -} -#endif /*__ARM_COMPUTE_UTILSEX_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLActivationLayerEx.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLActivationLayerEx.h deleted file mode 100644 index 7e578550f..000000000 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLActivationLayerEx.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __ARM_COMPUTE_CLACTIVATIONLAYEREX_H__ -#define __ARM_COMPUTE_CLACTIVATIONLAYEREX_H__ - -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -#include "arm_compute/core/TypesEx.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Basic function to run @ref CLActivationLayerExKernel - * - * @note The function simulates an activation layer with the specified activation function. - */ -class CLActivationLayerEx : public ICLSimpleFunction -{ -public: - /** Set the input and output tensor. - * - * @note If the output tensor is a nullptr or is equal to the input, the activation function will - * be performed in-place - * - * @param[in, out] input Source tensor. 
In case of @p output tensor = nullptr, this tensor will - * store the result - * of the activation function. Data types supported: - * QASYMM8/F16/F32. - * @param[out] output Destination tensor. Data type supported: same as @p input - * @param[in] act_info Activation layer parameters. - */ - void configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfoEx act_info); - /** Static function to check if given info will lead to a valid configuration of @ref - * CLActivationLayer - * - * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor - * will store the result - * of the activation function. Data types supported: QASYMM8/F16/F32. - * @param[in] output Destination tensor info. Data type supported: same as @p input - * @param[in] act_info Activation layer information. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const ActivationLayerInfoEx &act_info); -}; -} -#endif /* __ARM_COMPUTE_CLACTIVATIONLAYEREX_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMax.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMax.h deleted file mode 100644 index 8044c58af..000000000 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMax.h +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * @file CLArgMinMax.h - * @ingroup COM_AI_RUNTIME - * @brief This file contains arm_compute::CLArgMinMax class - */ - -#ifndef __ARM_COMPUTE_CLARG_MIN_MAX_H__ -#define __ARM_COMPUTE_CLARG_MIN_MAX_H__ - -#include "arm_compute/core/CL/kernels/CLArgMinMaxKernel.h" -#include "arm_compute/runtime/CL/CLTensor.h" -#include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/TypesEx.h" - -namespace arm_compute -{ -class ICLTensor; - -/** - * @brief Class to execute CLArgMinMax operation - */ -class CLArgMinMax : public IFunction -{ -public: - /** - * @brief Construct a new CLArgMinMax object - */ - CLArgMinMax(); - - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers) - */ - CLArgMinMax(const CLArgMinMax &) = delete; - - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers) - */ - CLArgMinMax &operator=(const CLArgMinMax &) = delete; - - /** - * @brief Construct a new CLArgMinMax object by using copy constructor - * @param[in] CLArgMinMax object to move - */ - CLArgMinMax(CLArgMinMax &&) = default; - - /** - * @brief Assign a CLArgMinMax object. - * @param[in] CLArgMinMax object to assign. This object will be moved. - */ - CLArgMinMax &operator=(CLArgMinMax &&) = default; - - /** - * @brief Initialise the kernel's inputs and outputs. - * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S32/F32. - * @param[out] output The result of argminmaxMax operation. Data types supported: same as @p - * input. - * @param[in] axis Axis to argminmax. It must be sorted and no duplicates. - * @param[in] is_min True for ArgMin operation. - * @param[in] is_max Ture for ArgMax operation. 
- * @return N/A - */ - void configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> argminmax_axis, - ArgOperation op); - - /** - * @brief Static function to check if given info will lead to a valid configuration - * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S32/F32. - * @param[in] axis Axis to argminmax - * @param[out] output The result of argminmaxMax operation. Data types supported: same as @p - * input. - * @return a status - */ - static Status validate(const ITensorInfo *input, const std::vector<uint32_t> &argminmax_axis, - const ITensorInfo *output, ArgOperation op); - - /** - * @brief Run the kernels contained in the function - * This operation works on CPU on GPU depending on the value of argminmax_MAX_RUN_ON_CPU macro - * in CLArgMinMax.cpp. - * If argminmax_MAX_RUN_ON_CPU == 1, CPU runs this operation. - * Otherwise GPU runs this operation. - * @return N/A - */ - void run() override; - -private: - ICLTensor *_input; - ICLTensor *_output; - std::vector<uint32_t> _argminmax_axis; - ArgOperation _arg_op; - - std::unique_ptr<CLTensor[]> _interm_tensors{nullptr}; - std::unique_ptr<CLArgMinMaxKernel[]> _argminmax_kernels{nullptr}; - size_t _num_of_kernels; -}; -} -#endif /*__ARM_COMPUTE_CLargminmax_MAX_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArithmeticSubtractionEx.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArithmeticSubtractionEx.h deleted file mode 100644 index 34e6c6334..000000000 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArithmeticSubtractionEx.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __ARM_COMPUTE_CLARITHMETICSUBTRACTIONEX_H__ -#define __ARM_COMPUTE_CLARITHMETICSUBTRACTIONEX_H__ - -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Basic function to run @ref CLArithmeticSubtractionExKernel - * - * @note The tensor data type for the inputs must be U8/S16/F16/F32. - * @note The function performs an arithmetic subtraction between two tensors. - */ -class CLArithmeticSubtractionEx : public ICLSimpleFunction -{ -public: - /** Initialise the kernel's inputs, output and convertion policy. - * - * @param[in, out] input1 An input tensor. Data types supported: U8/S16/F16/F32. - * The input tensor is [in, out] because its TensorInfo might be modified - * inside the kernel in case of broadcasting of dimension 0. - * @param[in, out] input2 An input tensor. Data types supported: same as @p input1. - * The input tensor is [in, out] because its TensorInfo might be modified - * inside the kernel in case of broadcasting of dimension 0. - * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), - * S16/F16/F32. - * @param[in] policy Policy to use to handle overflow. - */ - void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy); - /** Static function to check if given info will lead to a valid configuration of @ref - * CLArithmeticSubtractionEx - * - * @param[in] input1 First tensor input info. Data types supported: U8/S16/F16/F32. - * @param[in] input2 Second tensor input info. 
Data types supported: U8/S16/F16/F32. - * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), - * S16/F16/F32. - * @param[in] policy Policy to use to handle overflow. - * - * @return a status - */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *output, ConvertPolicy policy); -}; -} -#endif /* __ARM_COMPUTE_CLARITHMETICSUBTRACTIONEX_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h deleted file mode 100644 index d16a0762d..000000000 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ -#define __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ - -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Basic function to run @ref CLBatchToSpaceNDKernel - * - * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. - * @note The function converts the input tensor to the tensor of the output tensor's type. - */ -class CLBatchToSpaceND : public ICLSimpleFunction -{ -public: - /** Initialise the kernel's input and output. 
- * - * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * @param[in] block_size A pointer to an array of integer values specifying block sizes - * for spatial dimension. - */ - void configure(ICLTensor *input, ICLTensor *output, const int32_t *block_size); -}; - -} // namespace arm_compute -#endif /* __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h deleted file mode 100644 index 061e34f26..000000000 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __ARM_COMPUTE_CLBINARYLOGICALOP_H__ -#define __ARM_COMPUTE_CLBINARYLOGICALOP_H__ - -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" -#include "arm_compute/core/TypesEx.h" - -namespace arm_compute -{ -class ICLTensor; - -class CLBinaryLogicalOp : public ICLSimpleFunction -{ -public: - /** Initialise the function's source and destination. - * - * @param[in] input1 Source tensor1. Data types supported: U8, QASYMM8. - * @param[in] input2 Source tensor2. Data types supported: U8 QASYMM8. 
- * @param[out] output Output tensor. Data types supported: U8, QASYMM8. - */ - void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, - BinaryLogicalOperation op); -}; - -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLBINARYLOGICALOP_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h deleted file mode 100644 index 56b8408e2..000000000 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * @file CLCast.h - * @ingroup COM_AI_RUNTIME - * @brief This file contains arm_compute::CLCast class - */ - -#ifndef __ARM_COMPUTE_CLCAST_H__ -#define __ARM_COMPUTE_CLCAST_H__ - -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -namespace arm_compute -{ -class ICLTensor; - -/** - * @brief Class to run @ref CLCastKernel. - * This converts the input tensor to the tensor of the output tensor's type. - */ -class CLCast : public ICLSimpleFunction -{ -public: - /** - * @brief Initialise the kernel's input and output - * @param[in, out] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * The input tensor is [in, out] because its TensorInfo might be - * modified inside the kernel. - * @param[out] output Output tensor. 
Data types supported: U8/QASYMM8/S16/S32/F16/F32. - */ - void configure(ICLTensor *input, ICLTensor *output); -}; -} -#endif /* __ARM_COMPUTE_CLCAST_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLComparisonOp.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLComparisonOp.h deleted file mode 100644 index 1b0d70e7f..000000000 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLComparisonOp.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __ARM_COMPUTE_CLCOMPARISON_OP_H__ -#define __ARM_COMPUTE_CLCOMPARISON_OP_H__ - -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" -#include "arm_compute/core/TypesEx.h" - -namespace arm_compute -{ -class ICLTensor; - -class CLComparisonOp : public ICLSimpleFunction -{ -public: - /** Initialise the function's source and destination. - * - * @param[in] input1 Source tensor1. Data types supported: - * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. - * @param[in] input2 Source tensor2. Data types supported: - * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. - * @param[out] output Output tensor. Data types supported: Same as @p input. 
- */ - void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, - const ComparisonOperation &op); -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLCOMPARISON_OP_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h deleted file mode 100644 index d78a6ada4..000000000 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __ARM_COMPUTE_CLDEPTHTOSPACE_H__ -#define __ARM_COMPUTE_CLDEPTHTOSPACE_H__ - -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Basic function to run @ref CLDepthToSpaceKernel - * - * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. - * @note The function converts the input tensor to the tensor of the output tensor's type. - */ -class CLDepthToSpace : public ICLSimpleFunction -{ -public: - /** Initialise the kernel's input and output. - * - * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. 
- * @param[block_size] block size integer only - */ - void configure(ICLTensor *input, ICLTensor *output, const int32_t block_size); -}; -} // namesace arm_compute - -#endif /* __ARM_COMPUTE_CLDEPTHTOSPACE_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h deleted file mode 100644 index 257772a89..000000000 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * @file CLEmbeddingLookup.h - * @ingroup COM_AI_RUNTIME - * @brief This file contains arm_compute::CLEmbeddingLookup class - */ - -#ifndef __ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__ -#define __ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__ - -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -#include <vector> - -namespace arm_compute -{ -class ICLTensor; - -/** - * @brief Class to perform EmbeddingLookup operation - */ -class CLEmbeddingLookup : public ICLSimpleFunction -{ -public: - /** - * @brief Set the input and output tensors. - * @param[in] input Source tensor. - * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 - * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p - * input. 
- * @param[in] lookups Lookups 1D tensor that values are indices into the first dimension of - * input. - * @return N/A - */ - void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups); -}; -} -#endif /*__ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLExp.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLExp.h deleted file mode 100644 index 2d0fc23a4..000000000 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLExp.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __ARM_COMPUTE_CLEXP_H__ -#define __ARM_COMPUTE_CLEXP_H__ - -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Basic function to run @ref CLExpKernel */ -class CLExp : public ICLSimpleFunction -{ -public: - /** Set the source, destination of the kernel - * - * @param[in] input Source tensor. Data type supported: F32. - * @param[out] output Destination tensor. Data type supported: F32. 
- */ - void configure(const ICLTensor *input, ICLTensor *output); -}; -} -#endif /* __ARM_COMPUTE_CLEXP_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h deleted file mode 100644 index f7fd3cda1..000000000 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * @file CLGather.h - * @brief This file contains CLGather class - * @ingroup COM_AI_RUNTIME - */ - -#ifndef __ARM_COMPUTE_CLGATHER_H__ -#define __ARM_COMPUTE_CLGATHER_H__ - -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -namespace arm_compute -{ -class ICLTensor; - -/** - * @brief Class to to run @ref CLGatherKernel. - */ -class CLGather : public ICLSimpleFunction -{ -public: - /** - * @brief Initialise the kernel's inputs, output and convertion policy. - * @param[in] input1 An input tensor. Data types supported: U8/S32/F32. - * @param[in] input2 An indexes tensor. Data types supported: S32. - * @param[out] output The output tensor, Data types supported: same as @p input1. 
- * @return N/A - */ - void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output); - - /** - * @brief Static function to check if given info will lead to a valid configuration - * of @ref CLGather - * @param[in] input1 An input tensor. Data types supported: U8/S32/F32. - * @param[in] input2 An indexes tensor. Data types supported: S32. - * @param[out] output The output tensor, Data types supported: same as @p input1. - * @return a status - */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *output); -}; -} -#endif /*__ARM_COMPUTE_CLGATHER_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h deleted file mode 100644 index 65aa6cbd5..000000000 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/** - * @file CLHashtableLookup.h - * @ingroup COM_AI_RUNTIME - * @brief This file contains arm_compute::CLHashtableLookup class - */ - -#ifndef __ARM_COMPUTE_CLHASHTABLELOOKUP_H__ -#define __ARM_COMPUTE_CLHASHTABLELOOKUP_H__ - -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -#include <vector> - -namespace arm_compute -{ -class ICLTensor; - -/** - * @brief Class to perform HashtableLookup operation - */ -class CLHashtableLookup : public ICLSimpleFunction -{ -public: - /** - * @brief Set the input and output tensors. - * @param[in] lookups Lookups 1D tensor that values are indices into the first dimension of - * input. - * @param[in] keys Keys 1D tensor. keys and input pair represent a map. - * Data types supported: S32 - * @param[in] input Source tensor. - * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 - * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p - * input. - * @param[out] hits Hits 1D tensor. A boolean tensor that indicates whether the lookup hits - * (True) or not (False). Data types supported: U8/QASYMM8 - * @return N/A - */ - void configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *intput, - ICLTensor *output, ICLTensor *hits); -}; -} -#endif /*__ARM_COMPUTE_CLHASHTABLELOOKUP_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h deleted file mode 100644 index 198a0fd4e..000000000 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __ARM_COMPUTE_CLNEG_H__ -#define __ARM_COMPUTE_CLNEG_H__ - -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -namespace arm_compute -{ -class ICLTensor; - -class CLNeg : public ICLSimpleFunction -{ -public: - /** Initialise the function's source and destination. - * - * @param[in] input Source tensor. Data types supported: - * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. - * @param[out] output Output tensor. Data types supported: Same as @p input. - * - */ - void configure(ICLTensor *input, ICLTensor *output); -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLNEG_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNormalizationLayerEx.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNormalizationLayerEx.h deleted file mode 100644 index 4077245d5..000000000 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNormalizationLayerEx.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __ARM_COMPUTE_CLNORMALIZATIONLAYEREX_H__ -#define __ARM_COMPUTE_CLNORMALIZATIONLAYEREX_H__ - -#include "arm_compute/runtime/IFunction.h" - -#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" -#include "arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Basic function to compute a normalization layer. This function calls the following CL kernels: - * - * -# @ref CLFillBorderKernel - * -# @ref CLNormalizationLayerKernelEx - * - */ -class CLNormalizationLayerEx : public IFunction -{ -public: - /** Default constructor */ - CLNormalizationLayerEx(); - /** Set the input and output tensors. - * - * @param[in, out] input Source tensor. 3 lower dims represent a single input with dimensions - * [width, height, IFM], - * and an optional 4th dimension for batch of inputs. Data types - * supported: F16/F32 (Written to by the border handler) - * @param[out] output Destination tensor. Dimensions, data type and number of channels must - * match the input ones. - * @param[in] norm_info Normalization layer information like the normalization type, - * normalization size and other parameters. - */ - void configure(ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info); - /** Static function to check if given info will lead to a valid configuration of @ref - * CLNormalizationLayer - * - * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions - * [width, height, IFM], - * and an optional 4th dimension for batch of inputs. Data types supported: - * F16/F32 - * @param[in] output Destination tensor. Dimensions, data type and number of channels must - * match the input ones. - * @param[in] norm_info Normalization layer information like the normalization type, normalization - * size and other parameters. 
- * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const NormalizationLayerInfo &norm_info); - - // Inherited methods overridden: - void run() override; - -private: - CLNormalizationLayerExKernel _norm_kernel; /**< Normalization layer kernel to run */ - CLFillBorderKernel _border_handler; /**< Kernel to handle borders */ -}; -} -#endif /* __ARM_COMPUTE_CLNORMALIZATIONLAYEREX_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h deleted file mode 100644 index 622a61b5e..000000000 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __ARM_COMPUTE_CLPRELU_H__ -#define __ARM_COMPUTE_CLPRELU_H__ - -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -namespace arm_compute -{ -class ICLTensor; - -class CLPReLU : public ICLSimpleFunction -{ -public: - /** Initialise the function's source and destination. - * - * @param[in] input. Data types supported: - * QASYMM8/F16/F32. - * @param[in] alpha. Data types supported: - * QASYMM8/F16/F32. - * @param[out] output Output tensor. Data types supported: Same as @p input. 
- */ - void configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output); -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLPRELU_H__*/ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h deleted file mode 100644 index d6ea486d1..000000000 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h +++ /dev/null @@ -1,47 +0,0 @@ -/* -* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved -* Copyright (c) 2016-2018 ARM Limited. -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ -#ifndef __ARM_COMPUTE_CLPADLAYEREX_H__ -#define __ARM_COMPUTE_CLPADLAYEREX_H__ - -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Basic function to run @ref CLPadLayerKernel - * - * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. - * @note The function converts the input tensor to the tensor of the output tensor's type. - */ -class CLPadLayerEx : public ICLSimpleFunction -{ -public: - /** Initialise the kernel's input and output. - * - * @param[in] input Input tensor. Data types supported: - * U8/QASYMM8/S16/S32/F16/F32. - * @param[out] output Output tensor. Data types supported: - * U8/QASYMM8/S16/S32/F16/F32. - * @param[in] pad_size Tensor for Padding values in NHWC format shape [n, 2], - * where n is the rank of tensor . 
Data types supported: S32 - */ - void configure(ICLTensor *input, ICLTensor *output, ICLTensor *pad_size); -}; - -} // namespace arm_compute -#endif /* __ARM_COMPUTE_CLPADLAYEREX_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPermuteEx.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPermuteEx.h deleted file mode 100644 index 9a0cc213c..000000000 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPermuteEx.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __ARM_COMPUTE_CLPERMUTEEX_H__ -#define __ARM_COMPUTE_CLPERMUTEEX_H__ - -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Basic function to execute an @ref CLPermuteKernel. */ -class CLPermuteEx : public ICLSimpleFunction -{ -public: - /** Set the input and output tensors. - * - * @param[in] input The input tensor to permute. Data types supported: - * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] output The output tensor. Data types supported: Same as @p input - * @param[in] perm Permutation vector - */ - void configure(const ICLTensor *input, ICLTensor *output, const PermutationVector &perm); - /** Static function to check if given info will lead to a valid configuration of @ref CLPermute. - * - * @param[in] input First tensor input info. 
Data types supported: - * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. - * @param[in] output Output tensor info. Data types supported: same as @p input. - * @param[in] perm Permutation vector - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const PermutationVector &perm); -}; -} -#endif /*__ARM_COMPUTE_CLPERMUTEEX_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h deleted file mode 100644 index b142d3a2e..000000000 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * @file CLPixelWiseDivision.h - * @ingroup COM_AI_RUNTIME - * @brief This file contains arm_compute::CLPixelWiseDivision class - */ -#ifndef __ARM_COMPUTE_CLPIXELWISEDIVISION_H__ -#define __ARM_COMPUTE_CLPIXELWISEDIVISION_H__ - -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -namespace arm_compute -{ -class ICLTensor; - -/** - * @brief Class to run @ref CLPixelWiseDivisionKernel. - */ -class CLPixelWiseDivision : public ICLSimpleFunction -{ -public: - /** - * @brief Initialise the kernel's inputs, output and convertion policy. - * @param[in, out] input1 An input tensor. 
Data types supported: U8/S16/F16/F32 - * The input tensor is [in, out] because its TensorInfo might be - * modified inside the kernel in case of broadcasting of dimension 0. - * @param[in, out] input2 An input tensor. Data types supported: same as @p input1. - * The input tensor is [in, out] because its TensorInfo might be - * modified inside the kernel in case of broadcasting of dimension 0. - * @param[out] output The output tensor, Data types supported: same as @p input1. - * Note: U8 requires both inputs to be U8. - * @param[in] scale Scale to apply after multiplication. - * Scale must be positive and its value must be either 1/255 or - * 1/2^n where n is between 0 and 15. - * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate - * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest - * even. - * @return N/A - */ - void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale = 1.f, - ConvertPolicy overflow_policy = ConvertPolicy::WRAP, - RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO); - - /** - * @brief Static function to check if given info will lead to a valid configuration of @ref - * CLPixelWiseDivision - * @param[in] input1 An input tensor info. Data types supported: U8/S16/F16/F32 - * @param[in] input2 An input tensor info. Data types supported: same as @p input1. - * @param[in] output The output tensor info, Data types supported: same as @p input1. - * Note: U8 requires both inputs to be U8. - * @param[in] scale Scale to apply after multiplication. - * Scale must be positive and its value must be either 1/255 or 1/2^n - * where n is between 0 and 15. - * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate - * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. 
- * @return a status - */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *output, float scale = 1.f, - ConvertPolicy overflow_policy = ConvertPolicy::WRAP, - RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO); -}; -} -#endif /*__ARM_COMPUTE_CLPIXELWISEDIVISION_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h deleted file mode 100644 index e1a6f6ab4..000000000 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/** - * @file CLReduceOperation.h - * @ingroup COM_AI_RUNTIME - * @brief This file contains arm_compute::CLReduceOperation class - */ - -#ifndef __ARM_COMPUTE_CLREDUCEOPERATION_H__ -#define __ARM_COMPUTE_CLREDUCEOPERATION_H__ - -#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h" -#include "arm_compute/core/TypesEx.h" -#include "arm_compute/runtime/CL/CLTensor.h" -#include "arm_compute/runtime/IFunction.h" - -namespace arm_compute -{ -class ICLTensor; - -/** - * @brief Class to perform ReduceOperation - */ -class CLReduceOperation : public IFunction -{ -public: - /** - * @brief Construct a new ReduceOperation object - */ - CLReduceOperation(); - - /** - * @brief Set the input and output tensors. - * @param[in] input Source tensor. Data types supported: U8/S32/F32 - * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p - * input. - * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates. - * @param[in] op Reduce operation to perform. - * @return N/A - */ - void configure(ICLTensor *input, ICLTensor *output, const std::set<uint32_t> &axis, - ReduceOperation op); - - /** - * @brief Static function to check if given info will lead to a valid configuration of @ref - * CLReduceOperation. - * @param[in] input Source tensor info. Data types supported: U8/S32/F32 - * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p - * input. - * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates. - * @param[in] op Reduce operation to perform. 
- * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const std::set<uint32_t> &axis, const ReduceOperation &op); - - /** - * @brief Run the OpenCL kernel for this operation - * @return N/A - */ - void run() override; - -private: - ICLTensor *_input; - ICLTensor *_output; - std::set<uint32_t> _axis; - - std::unique_ptr<CLTensor[]> _interm_tensors{nullptr}; - std::unique_ptr<CLReduceOperationKernel[]> _reduce_kernels{nullptr}; -}; -} -#endif /*__ARM_COMPUTE_CLREDUCEOPERATION_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h deleted file mode 100644 index 7e2df8986..000000000 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_H__ -#define __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_H__ - -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Basic function to run @ref CLSpaceToBatchNDKernel - * - * @note The tensor data type for the inputs must be U8/QASYMM8/S16/F16/S32/F32. 
- * @note The function divides "spatial" dimensions of the input into a grid of blocks of shape - * block_shape, and interleaves these blocks with the "batch" dimension such that in the output. - */ -class CLSpaceToBatchND : public ICLSimpleFunction -{ -public: - /** Initialise the kernel's input and output. - * - * @note The data layout of input and output must be the same. - * @note The number of dimensions of input and output must be 4, and `spatial` dimensions - * are height and width. - * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32. - * Data layout supported: NCHW/NHWC - * @param[in] block_size Tensor of integer values specifying block sizes for spatial - * dimension. - * Data types supported: S32 - * @param[in] padding_size Tensor of integer values specifying padding sizes for spatial - * dimension. - * Data types supported: S32 - * @param[out] output Output tensor. Data types supported: same as @p input. - * Data layout supported: NCHW/NHWC - */ - void configure(const ICLTensor *input, const ICLTensor *block_size, const ICLTensor *padding_size, - ICLTensor *output); -}; - -} // namespace arm_compute -#endif /* __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h deleted file mode 100644 index 17f762092..000000000 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __ARM_COMPUTE_CLSPACETODEPTH_H__ -#define __ARM_COMPUTE_CLSPACETODEPTH_H__ - -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Basic function to run @ref CLSpaceToDepthKernel - * - * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. - * @note The function converts the input tensor to the tensor of the output tensor's type. - */ -class CLSpaceToDepth : public ICLSimpleFunction -{ -public: - /** Initialise the kernel's input and output. - * - * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * @param[block_size] block size integer only - */ - void configure(ICLTensor *input, ICLTensor *output, const int32_t block_size); -}; - -} // namespace arm_compute -#endif /* __ARM_COMPUTE_CLSPACETODEPTH_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSquaredDifference.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSquaredDifference.h deleted file mode 100644 index 3610ba71c..000000000 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSquaredDifference.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __ARM_COMPUTE_CLSQUARED_DIFFERENCE_H__ -#define __ARM_COMPUTE_CLSQUARED_DIFFERENCE_H__ - -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -namespace arm_compute -{ -class ICLTensor; - -class CLSquaredDifference : public ICLSimpleFunction -{ -public: - /** Initialise the function's source and destination. - * - * @param[in] input1 Source tensor1. Data types supported: - * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. - * @param[in] input2 Source tensor2. Data types supported: - * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. - * @param[out] output Output tensor. Data types supported: Same as @p input. - */ - void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output); -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLSQUARED_DIFFERENCE_H__*/ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h deleted file mode 100644 index 6b26a85c8..000000000 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * @file CLStridedSlice.h - * @ingroup COM_AI_RUNTIME - * @brief This file contains arm_compute::CLStridedSlice and arm_compute::CLStridedSliceCPU class - */ - -#ifndef __ARM_COMPUTE_CLSTRIDEDSLICEEX_H__ -#define __ARM_COMPUTE_CLSTRIDEDSLICEEX_H__ - -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -namespace arm_compute -{ -class ICLTensor; - -/** - * @brief Class to run @ref CLStridedSliceKernel - */ -class CLStridedSliceEx : public ICLSimpleFunction -{ -public: - /** - * @brief Initialise the kernel's inputs and outputs - * @param[in] input Tensor input. Data type supported: - * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 - * @param[out] output Output tensor. 
Data type supported: Same as @p input - * @param[in] beginData 'begin' vector of strided slice operation - * @param[in] endData 'end' vector of strided slice operation - * @param[in] stridesData 'strides' vector of strided slice operation - * @param[in] beginMask If the ith bit is set, begin[i] is ignored - * @param[in] endMask If the ith bit is set, end[i] is ignored - * @param[in] shrinkAxisMask If the ith bit is set, the ith specification shrinks the - * dimensionality by 1, taking on the value at index begin[i] - * @return N/A - */ - void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData, - ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask, - int32_t shrinkAxisMask); -}; -} -#endif /*__ARM_COMPUTE_CLSTRIDEDSLICEEX_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h deleted file mode 100644 index 5327e016f..000000000 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/** - * @file CLTopKV2.h - * @ingroup COM_AI_RUNTIME - * @brief This file contains arm_compute::CLTopKV2 class - */ -#ifndef __ARM_COMPUTE_CLTOPK_V2_H__ -#define __ARM_COMPUTE_CLTOPK_V2_H__ - -#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h" - -#include "arm_compute/runtime/IFunction.h" - -namespace arm_compute -{ -class ICLTensor; - -/** - * @brief Class to execute TopKV2 operation. - */ -class CLTopKV2 : public IFunction -{ -public: - /** - * @brief Construct a new CLTopKV2 object - */ - CLTopKV2(); - - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers) - */ - CLTopKV2(const CLTopKV2 &) = delete; - - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers) - */ - CLTopKV2 &operator=(const CLTopKV2 &) = delete; - - /** - * @brief Construct a new CLTopKV2 object by using copy constructor - * @param[in] CLTopKV2 object to move - */ - CLTopKV2(CLTopKV2 &&) = default; - - /** - * @brief Assign a CLTopKV2 object. - * @param[in] CLTopKV2 object to assign. This object will be moved. - */ - CLTopKV2 &operator=(CLTopKV2 &&) = default; - - /** - * @brief Initialise the kernel's inputs and outputs. - * @param[in] input Input image. Data types supported: U8/S16/F32. - * @param[in] k The value of `k`. - * @param[out] values Top k values. Data types supported: S32 if input type is U8/S16, F32 if - * input type is F32. - * @param[out] indices Indices related to top k values. Data types supported: S32 if input type - * is U8/S16, F32 if input type is F32. - * @return N/A - */ - void configure(ICLTensor *input, int k, ICLTensor *values, ICLTensor *indices, - int total_bits = 32, int bits = 4); - - /** - * @brief Run the kernels contained in the function - * Depending on the value of the following environment variables it works differently: - * - If the value of environment variable "ACL_TOPKV2" == "GPU_SINGLE", - * quick sort on GPU is used. 
- * - If the value of environment variable "ACL_TOPKV2" == ""GPU"", - * radix sort on GPU is used. - * - For other value, TopKV2 runs on CPU - * @return N/A - */ - void run() override; - -private: - void run_on_cpu(); - void run_on_gpu(); - void run_on_gpu_single_quicksort(); - - uint32_t _k; - uint32_t _total_bits; - uint32_t _bits; - uint32_t _radix; - uint32_t _hist_buf_size; - uint32_t _glob_sum_buf_size; - uint32_t _n; - - ICLTensor *_input; - ICLTensor *_values; - ICLTensor *_indices; - - cl::Buffer _qs_idx_buf; - cl::Buffer _qs_temp_buf; - cl::Buffer _hist_buf; - cl::Buffer _glob_sum_buf; - cl::Buffer _temp_buf; - cl::Buffer _first_negative_idx_buf; - cl::Buffer _in_key_buf; - cl::Buffer _out_key_buf; - cl::Buffer _in_ind_buf; - cl::Buffer _out_ind_buf; - - cl::Buffer *_p_in_key_buf; - cl::Buffer *_p_out_key_buf; - cl::Buffer *_p_in_ind_buf; - cl::Buffer *_p_out_ind_buf; - - CLTopKV2Single _qs_kernel; - CLTopKV2Init _init_kernel; - CLRadixSortHistogram _hist_kernel; - CLRadixSortScanHistogram _scan_hist_kernel; - CLRadixSortGlobalScanHistogram _glob_scan_hist_kernel; - CLRadixSortPasteHistogram _paste_hist_kernel; - CLRadixSortReorder _reorder_kernel; - CLTopKV2FindFirstNegative _find_first_negative_kernel; - CLTopKV2ReorderNegatives _reorder_negatives_kernel; - CLTopKV2Store _store_kernel; -}; -} -#endif // __ARM_COMPUTE_CLTOPK_V2_H__ diff --git a/libs/ARMComputeEx/arm_compute/runtime/NEON/functions/NENormalizationLayerEx.h b/libs/ARMComputeEx/arm_compute/runtime/NEON/functions/NENormalizationLayerEx.h deleted file mode 100644 index fa7408ecd..000000000 --- a/libs/ARMComputeEx/arm_compute/runtime/NEON/functions/NENormalizationLayerEx.h +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __ARM_COMPUTE_NENORMALIZATIONLAYEREX_H__ -#define __ARM_COMPUTE_NENORMALIZATIONLAYEREX_H__ - -#include "arm_compute/runtime/IFunction.h" - -#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" -#include "arm_compute/core/NEON/kernels/NENormalizationLayerExKernel.h" -#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h" -#include "arm_compute/runtime/MemoryGroup.h" - -namespace arm_compute -{ -class ITensor; - -/** Basic function to compute a normalization layer. This function calls the following NEON kernels: - * - * -# @ref NEPixelWiseMultiplicationKernel - * -# @ref NEFillBorderKernel - * -# @ref NENormalizationLayerKernelEx - * - */ -class NENormalizationLayerEx : public IFunction -{ -public: - /** Default constructor */ - NENormalizationLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr); - /** Set the input and output tensors. - * - * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions - * [width, height, IFM], - * and an optional 4th dimension for batch of inputs. Data type supported: - * F16/F32 - * @param[out] output Destination with the same dimensions, data type and number of channels of - * @p input - * @param[in] norm_info Normalization layer information like the normalization type, - * normalization size and other parameters. 
- */ - void configure(const ITensor *input, ITensor *output, const NormalizationLayerInfo &norm_info); - /** Static function to check if given info will lead to a valid configuration of @ref - * NENormalizationLayer - * - * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions - * [width, height, IFM], - * and an optional 4th dimension for batch of inputs. Data type supported: - * F16/F32 - * @param[in] output Destination with the same dimensions, data type and number of channels of - * @p input - * @param[in] norm_info Normalization layer information like the normalization type, normalization - * size and other parameters. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const NormalizationLayerInfo &norm_info); - - // Inherited methods overridden: - void run() override; - -private: - MemoryGroup _memory_group; /**< Function memory group */ - NENormalizationLayerExKernel _norm_kernel; /**< Normalization layer kernel */ - NEPixelWiseMultiplicationKernel _multiply_kernel; /**< Pixel multiplication kernel */ - NEFillBorderKernel _border_handler; /**< Kernel to handle borders */ - Tensor _input_squared; /**< The intermediate buffer which stores results of squaring input */ -}; -} -#endif /* __ARM_COMPUTE_NENORMALIZATIONLAYEREX_H__ */ diff --git a/libs/ARMComputeEx/resolve_includes.py b/libs/ARMComputeEx/resolve_includes.py deleted file mode 100644 index b3e252892..000000000 --- a/libs/ARMComputeEx/resolve_includes.py +++ /dev/null @@ -1,102 +0,0 @@ -# Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved -# Copyright (c) 2016, 2017 ARM Limited. 
-# -# SPDX-License-Identifier: MIT -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to -# deal in the Software without restriction, including without limitation the -# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or -# sell copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
-import collections -import os.path -import re -import subprocess -import glob - - -def resolve_includes(target, source): - # File collection - FileEntry = collections.namedtuple('FileEntry', 'target_name file_contents') - - # Include pattern - pattern = re.compile("#include \"(.*)\"") - - # Get file contents - files = [] - for i in range(len(source)): - src = source[i] - dst = target[i] - f = open(src) - cts = f.read() - f.close() - contents = cts.splitlines() - entry = FileEntry(target_name=dst, file_contents=contents) - files.append((os.path.basename(src), entry)) - - # Create dictionary of tupled list - files_dict = dict(files) - - # Check for includes (can only be files in the same folder) - final_files = [] - for file in files: - done = False - tmp_file = file[1].file_contents - print(file[1].target_name) - while not done: - file_count = 0 - updated_file = [] - for line in tmp_file: - found = pattern.search(line) - if found: - include_file = found.group(1) - data = files_dict[include_file].file_contents - updated_file.extend(data) - else: - updated_file.append(line) - file_count += 1 - - # Check if all include are replaced. 
- if file_count == len(tmp_file): - done = True - - # Update temp file - tmp_file = updated_file - - # Append and prepend string literal identifiers and add expanded file to final list - tmp_file.insert(0, "R\"(\n") - tmp_file.append("\n)\"") - entry = FileEntry(target_name=file[1].target_name, file_contents=tmp_file) - final_files.append((file[0], entry)) - - # Write output files - for file in final_files: - with open(file[1].target_name, 'w+') as out_file: - out_file.write("\n".join(file[1].file_contents)) - - -# Generate embed files -cl_files = glob.glob('src/core/CL/cl_kernels/*.cl') -cl_files += glob.glob('src/core/CL/cl_kernels/*.h') - -# DEBUG: print cl files -print("cl_files:") -print(cl_files) - -embed_files = [f + "embed" for f in cl_files] -print("embed_files:") -print(embed_files) - -resolve_includes(embed_files, cl_files) diff --git a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp deleted file mode 100644 index 05ecdeb22..000000000 --- a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp +++ /dev/null @@ -1,409 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Utils.h" - -#include <algorithm> -#include <fstream> -#include <iostream> -#include <utility> -#include <vector> - -using namespace arm_compute; - -const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map = { - // ARMComputeEx kernels - {"activation_layer_ex", "activation_layer_ex.cl"}, - {"arg_op", "arg_operation.cl"}, - {"arithmetic_sub_ex", "arithmetic_op_ex.cl"}, - {"arithmetic_add_qasymm8", "arithmetic_op_quantized.cl"}, - {"batch_to_space_nd", "batch_to_space_nd.cl"}, - {"binary_logical_op", "binary_logical_op.cl"}, - {"cast", "cast.cl"}, - {"cast_qasymm_in", "cast.cl"}, - {"cast_qasymm_out", "cast.cl"}, - {"comparison_op", "comparison_op.cl"}, - {"comparison_op_qasymm8", "comparison_op_quantized.cl"}, - {"depth_to_space", "depth_to_space.cl"}, - {"embedding_lookup", "embedding_lookup.cl"}, - {"exp_layer", "exp.cl"}, - {"gather", "gather.cl"}, - {"gather_1d", "gather.cl"}, - {"gather_1d_out", "gather.cl"}, - {"hashtable_lookup", "hashtable_lookup.cl"}, - {"neg_tensor", "neg_tensor.cl"}, - {"pad", "pad.cl"}, - {"permute_generic", "permute_ex.cl"}, - {"pixelwise_mul_qasymm8", "pixelwise_mul_quantized.cl"}, - {"pixelwise_div_float", "pixelwise_div_float.cl"}, - {"pixelwise_div_int", "pixelwise_div_int.cl"}, - {"prelu", "prelu.cl"}, - 
{"prelu_qasymm8", "prelu_quantized.cl"}, - {"reduce_min_max", "reduce_operation.cl"}, - {"reduce_sum_mean", "reduce_operation.cl"}, - {"squared_difference", "squared_difference.cl"}, - {"strided_slice_ex", "strided_slice_ex.cl"}, - {"topkv2_init", "topkv2.cl"}, - {"topkv2_find_first_negative", "topkv2.cl"}, - {"topkv2_reorder_negatives", "topkv2.cl"}, - {"topkv2_store", "topkv2.cl"}, - {"radixsort_histogram", "topkv2_radixsort.cl"}, - {"radixsort_scanhistograms", "topkv2_radixsort.cl"}, - {"radixsort_pastehistograms", "topkv2_radixsort.cl"}, - {"radixsort_reorder", "topkv2_radixsort.cl"}, - {"topkv2_quicksort", "topkv2_quicksort.cl"}, - {"space_to_batch_4d_nchw", "space_to_batch.cl"}, - {"space_to_batch_4d_nhwc", "space_to_batch.cl"}, - {"space_to_depth", "space_to_depth.cl"}, -}; - -const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map = { -#ifdef EMBEDDED_KERNELS - { - "activation_layer_ex.cl", -#include "./cl_kernels/activation_layer_ex.clembed" - }, - { - "arg_operation.cl", -#include "./cl_kernels/arg_operation.clembed" - }, - { - "arithmetic_op_ex.cl", -#include "./cl_kernels/arithmetic_op_ex.clembed" - }, - { - "batch_to_space_nd.cl", -#include "./cl_kernels/batch_to_space_nd.clembed" - }, - { - "cast.cl", -#include "./cl_kernels/cast.clembed" - }, - { - "comparison_op.cl", -#include "./cl_kernels/comparison_op.clembed" - }, - { - "comparison_op_quantized.cl", -#include "./cl_kernels/comparison_op_quantized.clembed" - }, - { - "embedding_lookup.cl", -#include "./cl_kernels/embedding_lookup.clembed" - }, - { - "depth_to_space.cl", -#include "./cl_kernels/depth_to_space.clembed" - }, - { - "exp.cl", -#include "./cl_kernels/exp.clembed" - }, - { - "gather.cl", -#include "./cl_kernels/gather.clembed" - }, - { - "hashtable_lookup.cl", -#include "./cl_kernels/hashtable_lookup.clembed" - }, - { - "helpers.h", -#include "./cl_kernels/helpers.hembed" - }, - { - "helpers_asymm.h", -#include "./cl_kernels/helpers_asymm.hembed" - }, - { - 
"binary_logical_op.cl", -#include "./cl_kernels/binary_logical_op.clembed" - }, - { - "neg_tensor.cl", -#include "./cl_kernels/neg_tensor.clembed" - }, - { - "pad.cl", -#include "./cl_kernels/pad.clembed" - }, - { - "pixelwise_div_float.cl", -#include "./cl_kernels/pixelwise_div_float.clembed" - }, - { - "pixelwise_div_int.cl", -#include "./cl_kernels/pixelwise_div_int.clembed" - }, - { - "prelu.cl", -#include "./cl_kernels/prelu.clembed" - }, - { - "prelu_quantized.cl", -#include "./cl_kernels/prelu_quantized.clembed" - }, - { - "reduce_operation.cl", -#include "./cl_kernels/reduce_operation.clembed" - }, - { - "space_to_batch.cl", -#include "./cl_kernels/space_to_batch.clembed" - }, - { - "space_to_depth.cl", -#include "./cl_kernels/space_to_depth.clembed" - }, - { - "squared_difference.cl", -#include "./cl_kernels/squared_difference.clembed" - }, - { - "strided_slice_ex.cl", -#include "./cl_kernels/strided_slice_ex.clembed" - }, - { - "topkv2.cl", -#include "./cl_kernels/topkv2.clembed" - }, - { - "topkv2_radixsort.cl", -#include "./cl_kernels/topkv2_radixsort.clembed" - }, - { - "topkv2_quicksort.cl", -#include "./cl_kernels/topkv2_quicksort.clembed" - }, - { - "permute_ex.cl", -#include "./cl_kernels/permute_ex.clembed" - }, - -#endif /* EMBEDDED_KERNELS */ -}; - -CLKernelLibraryEx::CLKernelLibraryEx() - : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map() -{ - opencl_is_available(); // Make sure the OpenCL symbols are initialised *before* the - // CLKernelLibraryEx is built -} - -CLKernelLibraryEx &CLKernelLibraryEx::get() -{ - static CLKernelLibraryEx _kernel_library; - return _kernel_library; -} - -Kernel CLKernelLibraryEx::create_kernel(const std::string &kernel_name, - const StringSet &build_options_set) const -{ - // Find which program contains the kernel - auto kernel_program_it = _kernel_program_map.find(kernel_name); - - if (_kernel_program_map.end() == kernel_program_it) - { - ARM_COMPUTE_ERROR("Kernel %s not found in 
the CLKernelLibrary", kernel_name.c_str()); - } - std::string concat_str; - - if (fp16_supported()) - { - concat_str += " -DARM_COMPUTE_OPENCL_FP16_ENABLED=1 "; - } - - if (get_cl_version(_device) == CLVersion::CL20) - { - concat_str += " -cl-std=CL2.0 "; - } - else if (arm_non_uniform_workgroup_supported(_device)) - { - concat_str += " -cl-arm-non-uniform-work-group-size "; - } - else - { - ARM_COMPUTE_ERROR("Non uniform workgroup size is not supported!!"); - } - - // Check if the program has been built before with same build options. - const std::string program_name = kernel_program_it->second; - const std::string build_options = stringify_set(build_options_set) + concat_str; - - const std::string built_program_name = program_name + "_" + build_options; - auto built_program_it = _built_programs_map.find(built_program_name); - - cl::Program cl_program; - - if (_built_programs_map.end() != built_program_it) - { - // If program has been built, retrieve to create kernel from it - cl_program = built_program_it->second; - } - else - { - // Get program - Program program = load_program(program_name); - - // Build program - cl_program = program.build(build_options); - - // Add built program to internal map - _built_programs_map.emplace(built_program_name, cl_program); - } - - // Create and return kernel - return Kernel(kernel_name, cl_program); -} - -void CLKernelLibraryEx::add_built_program(const std::string &built_program_name, - cl::Program program) -{ - _built_programs_map.emplace(built_program_name, program); -} - -bool CLKernelLibraryEx::fp16_supported() const { return ::fp16_supported(_device); } - -bool CLKernelLibraryEx::int64_base_atomics_supported() const -{ - return device_supports_extension(_device, "cl_khr_int64_base_atomics"); -} - -const Program &CLKernelLibraryEx::load_program(const std::string &program_name) const -{ - const auto program_it = _programs_map.find(program_name); - - if (program_it != _programs_map.end()) - { - return program_it->second; - } 
- - Program program; - -#ifdef EMBEDDED_KERNELS - const auto program_source_it = _program_source_map.find(program_name); - - if (_program_source_map.end() == program_source_it) - { - ARM_COMPUTE_ERROR("Embedded program for %s does not exist.", program_name.c_str()); - } - - program = Program(_context, program_name, program_source_it->second); -#else /* EMBEDDED_KERNELS */ - // Check for binary - std::string source_name = _kernel_path + program_name; - std::string binary_name = source_name + "bin"; - - if (std::ifstream(binary_name).is_open()) - { - const std::string program_binary = read_file(binary_name, true); - program = Program(_context, _device, program_name, - std::vector<unsigned char>(program_binary.begin(), program_binary.end())); - } - else if (std::ifstream(source_name).is_open()) - { - program = Program(_context, program_name, read_file(source_name, false)); - } - else - { - ARM_COMPUTE_ERROR("Kernel file %s does not exist.", source_name.c_str()); - } -#endif /* EMBEDDED_KERNELS */ - - // Insert program to program map - const auto new_program = _programs_map.emplace(program_name, std::move(program)); - - return new_program.first->second; -} - -std::string CLKernelLibraryEx::stringify_set(const StringSet &s) const -{ - std::string concat_set; - -#ifndef EMBEDDED_KERNELS - concat_set += "-I" + _kernel_path + " "; -#endif /* EMBEDDED_KERNELS */ - - // Concatenate set - for (const auto &el : s) - { - concat_set += " " + el; - } - - return concat_set; -} - -std::string CLKernelLibraryEx::get_program_source(const std::string &program_name) -{ - const auto program_source_it = _program_source_map.find(program_name); - - if (program_source_it == _program_source_map.end()) - { - ARM_COMPUTE_ERROR("Embedded program for %s does not exist.", program_name.c_str()); - } - - return program_source_it->second; -} - -size_t CLKernelLibraryEx::max_local_workgroup_size(const cl::Kernel &kernel) const -{ - size_t result; - - size_t err = kernel.getWorkGroupInfo(_device, 
CL_KERNEL_WORK_GROUP_SIZE, &result); - ARM_COMPUTE_ERROR_ON_MSG( - err != 0, - "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel"); - ARM_COMPUTE_UNUSED(err); - - return result; -} - -cl::NDRange CLKernelLibraryEx::default_ndrange() const -{ - // GPUTarget _target = get_target_from_device(_device); - cl::Device device = cl::Device::getDefault(); - GPUTarget _target = get_target_from_device(device); - cl::NDRange default_range; - - switch (_target) - { - case GPUTarget::MIDGARD: - case GPUTarget::T600: - case GPUTarget::T700: - case GPUTarget::T800: - default_range = cl::NDRange(128u, 1); - break; - default: - default_range = cl::NullRange; - } - - return default_range; -} - -std::string CLKernelLibraryEx::get_device_version() { return _device.getInfo<CL_DEVICE_VERSION>(); } diff --git a/libs/ARMComputeEx/src/core/CL/OpenCLEx.cpp b/libs/ARMComputeEx/src/core/CL/OpenCLEx.cpp deleted file mode 100644 index cbda169fb..000000000 --- a/libs/ARMComputeEx/src/core/CL/OpenCLEx.cpp +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/CL/OpenCLEx.h" - -#include <dlfcn.h> -#include <iostream> - -namespace arm_compute -{ -CLSymbolsEx &CLSymbolsEx::get() -{ - static CLSymbolsEx symbols; - return symbols; -} - -bool CLSymbolsEx::load_default() -{ - static const std::vector<std::string> libraries{"libOpenCL.so", "libGLES_mali.so", "libmali.so"}; - - if (_loaded.first) - { - return _loaded.second; - } - - // Indicate that default loading has been tried - _loaded.first = true; - - for (const auto &lib : libraries) - { - if (load(lib)) - { - return true; - } - } - - std::cerr << "Couldn't find any OpenCL library.\n"; - return false; -} - -bool CLSymbolsEx::load(const std::string &library) -{ - void *handle = dlopen(library.c_str(), RTLD_LAZY | RTLD_LOCAL); - - if (handle == nullptr) - { - std::cerr << "Can't load " << library << ": " << dlerror() << "\n"; - // Set status of loading to failed - _loaded.second = false; - return false; - } - -#define LOAD_FUNCTION_PTR(func_name, handle) \ - func_name##_ptr = reinterpret_cast<decltype(func_name) *>(dlsym(handle, #func_name)); - - LOAD_FUNCTION_PTR(clGetEventInfo, handle); - LOAD_FUNCTION_PTR(clSetEventCallback, handle); - -#undef LOAD_FUNCTION_PTR - - // Don't call dlclose(handle) or all the symbols will be unloaded ! 
- - // Disable default loading and set status to successful - _loaded = std::make_pair(true, true); - - return true; -} - -} // namespace arm_compute - -cl_int clGetEventInfo(cl_event event, cl_event_info param_name, size_t param_value_size, - void *param_value, size_t *param_value_size_ret) -{ - arm_compute::CLSymbolsEx::get().load_default(); - auto func = arm_compute::CLSymbolsEx::get().clGetEventInfo_ptr; - if (func != nullptr) - { - return func(event, param_name, param_value_size, param_value, param_value_size_ret); - } - else - { - return CL_OUT_OF_RESOURCES; - } -} - -cl_int clSetEventCallback(cl_event event, cl_int command_exec_callback_type, - void(CL_CALLBACK *pfn_ev_notify)(cl_event ev, cl_int ev_cmd_exec_status, - void *user_data), - void *user_data) -{ - arm_compute::CLSymbolsEx::get().load_default(); - auto func = arm_compute::CLSymbolsEx::get().clSetEventCallback_ptr; - if (func != nullptr) - { - return func(event, command_exec_callback_type, pfn_ev_notify, user_data); - } - else - { - return CL_OUT_OF_RESOURCES; - } -} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl deleted file mode 100644 index f54c7bde3..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - -#define CONST_ONE 1.f -#define DIV_OP(a, b) ((a) / (b)) -#define RSQRT_OP(a) DIV_OP(CONST_ONE, sqrt((a))) - -// Inverse Square-root Activation -inline TYPE rsqrt_op(TYPE x) -{ - return RSQRT_OP(x); -} - -#define ACTIVATION_OP2(op, x) op##_op(x) -#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x) - -#if defined(ACT) - -/** This performs an activation function floating point inputs. - * - * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH - * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively. - * - * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void activation_layer_ex( - TENSOR3D_DECLARATION(input) -#ifndef IN_PLACE - , - TENSOR3D_DECLARATION(output) -#endif /* not IN_PLACE */ -) -{ - // Get pixels pointer - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); -#ifdef IN_PLACE - Tensor3D output = input; -#else /* IN_PLACE */ - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); -#endif /* IN_PLACE */ - - // Load data - TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr); - - // Perform activation - data = ACTIVATION_OP(ACT, data); - - // Store result - VSTORE(VEC_SIZE) - (data, 0, (__global DATA_TYPE *)output.ptr); -} - -#endif /* defined(ACT) */ diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl deleted file mode 100644 index 9a6921d7c..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) -/** Perform arg_max/arg_min - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 - * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using - * -DOP_CODE = number. e.g. -DOP_CODE=1 - * - * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[out] output_ptr Pointer to the destination image. 
Supported data types: U32 - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] axis Axis through which reduction occurs for max value index - * @param[in] dim Dimension across the axis to be reduced. 
- */ - -__kernel void arg_op(TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output), - const int axis, - const int dim) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); - - int indices[4] = - { - get_global_id(0), - get_global_id(1), - get_global_id(2) % DEPTH_OUT, - get_global_id(2) / DEPTH_OUT, - }; - - DATA_TYPE value = *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); - DATA_TYPE tval = value; - int idx = 0; - for(int i = 1; i < dim; ++i) - { - indices[axis] = i; - - #if OP_CODE == 1 // ArgMax - value = max(value, *((__global DATA_TYPE *) - tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]))); - #elif OP_CODE == 2 //ArgMin - value = min(value, *((__global DATA_TYPE *) - tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]))); - #else - return; - - #endif - - if(tval!=value) - { - idx = indices[axis]; - tval = value; - } - } - - *((__global uint *)out.ptr) = idx; -} -#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_ex.cl deleted file mode 100644 index 2ed698951..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_ex.cl +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#ifdef SATURATE -#define SUB(x, y) sub_sat((x), (y)) -#else /* SATURATE */ -#define SUB(x, y) (x) - (y) -#endif /* SATURATE */ - -/** This function subtracts one tensors from another. - * - * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: - * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short - * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used. - * - * @param[in] in1_ptr Pointer to the source tensor. Supported data types: U8, S16 - * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] in2_ptr Pointer to the source tensor. 
Supported data types: U8, S16 - * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8, S16 - * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void arithmetic_sub_ex( - TENSOR3D_DECLARATION(in1), - TENSOR3D_DECLARATION(in2), - TENSOR3D_DECLARATION(out)) -{ - // Get pixels pointer - Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); - Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); - - // Load values - VEC_DATA_TYPE(DATA_TYPE_OUT, 16) - in_a = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16)); - VEC_DATA_TYPE(DATA_TYPE_OUT, 16) - in_b = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), 
VEC_DATA_TYPE(DATA_TYPE_OUT, 16)); - - // Calculate and store result - vstore16(SUB(in_a, in_b), 0, (__global DATA_TYPE_OUT *)out.ptr); -} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl deleted file mode 100644 index 5cd0a4309..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016, 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers_asymm.h" - -#ifdef SATURATE -#define ADD(x, y) add_sat((x), (y)) -#define SUB(x, y) sub_sat((x), (y)) -#else /* SATURATE */ -#define ADD(x, y) (x) + (y) -#define SUB(x, y) (x) - (y) -#endif /* SATURATE */ - -/** Performs a pixelwise addition used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8 - * - * The following computations will be performed: - * - * -# Add offset terms to inputs - -# Get scaled value of two inputs - * -# Add inputs - * -# Add offset terms to final result - * -# Multiply each entry of result by result_mult_int - * -# Shift the int32 accumulator by result_shift - * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. 
- * - * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: - * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar - * @attention The number of bits to shift left of input tensors must be passed at compile time using -DLEFT_SHIFT - * @attention The offset, scalar scale factor and number of bits to shift right of input tensors must be passed at compile time using -DIN1_OFFSET, -RIN1_MULT_INT, -DIN1_SHIFT, -DIN2_OFFSET, -RIN2_MULT_INT and -DIN2_SHIFT - * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT - * - * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: - * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar - * @attention The inputs and output scale information of qasymm8 need to be passed at compile time using -DSCALE_IN1, -DSCALE_IN2 and -DSCALE_OUT: - * e.g. -DSCALE_IN1=1.f -DSCALE_IN2=1.f -DSCALE_OUT=2.f - * @attention The inputs and output scale offset need to be passed at compile time using -DOFFSET_IN1, -DOFFSET_IN2 and -DOFFSET_OUT: - * e.g. -DOFFSET_IN1=0 -DOFFSET_IN2=0 -DOFFSET_OUT=0 - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used. - * - * @param[in] in1_ptr Pointer to the source tensor. 
Supported data types: QASYMM8 - * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] in2_ptr Pointer to the source tensor. Supported data types: QASYMM8 - * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] out_ptr Pointer to the destination tensor. 
Supported data types: QASYMM8 - * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void arithmetic_add_qasymm8( - TENSOR3D_DECLARATION(in1), - TENSOR3D_DECLARATION(in2), - TENSOR3D_DECLARATION(out)) -{ - // Get pixels pointer - Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); - Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); - - // Load data - VEC_DATA_TYPE(int, 16) - in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16)); - VEC_DATA_TYPE(int, 16) - in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16)); - - // Get scaled value of two inputs - VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET); - VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET); - - VEC_DATA_TYPE(int, 16) left_shift = (VEC_DATA_TYPE(int, 16))1 << (VEC_DATA_TYPE(int, 16))(LEFT_SHIFT); - VEC_DATA_TYPE(int, 16) shifted_in1_val = in1_val * left_shift; - VEC_DATA_TYPE(int, 16) shifted_in2_val = in2_val * left_shift; - - VEC_DATA_TYPE(int, 16) scaled_in1_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in1_val, IN1_MULT_INT, IN1_SHIFT, 16); - VEC_DATA_TYPE(int, 16) scaled_in2_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in2_val, IN2_MULT_INT, IN2_SHIFT, 16); - - // Add inputs and 
multiply with a multiplier smaller than 1 - VEC_DATA_TYPE(int, 16) sum_val = scaled_in1_val + scaled_in2_val; - VEC_DATA_TYPE(int, 16) out_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(sum_val, RESULT_MULT_INT, RESULT_SHIFT, 16); - out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET); - - VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16)); - -// TODO: Apply min-max BOUND to support fuse with relu. -/* -#if defined(MIN_BOUND) - res = max(res, (uchar16)MIN_BOUND); -#endif // defined(MIN_BOUND) -#if defined(MAX_BOUND) - res = min(res, (uchar16)MAX_BOUND); -#endif // defined(MAX_BOUND) -*/ - - // Store result - VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), - 0, (__global DATA_TYPE_OUT *)out.ptr); -} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/batch_to_space_nd.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/batch_to_space_nd.cl deleted file mode 100644 index ad6a48a02..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/batch_to_space_nd.cl +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016, 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE0) && defined(BLOCK_SIZE1) && defined(BATCH_OUT) -/** Perform batch to space rearrangement of tensor - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. 
-DDATA_TYPE=float - * @attention Output tensor batch should be given as a preprocessor argument using -DBATCH_OUT=size. e.g. -DBATCH_OUT=16 - * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE0=size. e.g. -DBLOCK_SIZE0=1 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * - * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p inpu -t_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in -bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void batch_to_space_nd( - TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output)) - { - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); - - int out_index[4]={0}; - int in_index[4]={0}; - - out_index[0] = get_global_id(0);//W - out_index[1] = get_global_id(1);//H - out_index[2] = get_global_id(2) % DEPTH_OUT;//C - out_index[3] = get_global_id(2) / DEPTH_OUT;//N - - in_index[0] = out_index[0]/BLOCK_SIZE1; - in_index[1] = out_index[1]/BLOCK_SIZE0; - in_index[2] = out_index[2]; - in_index[3] = out_index[3] + ((out_index[1] % BLOCK_SIZE0) * BLOCK_SIZE0 + out_index[0] % BLOCK_SIZE1) * BATCH_OUT; - - *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3])); - } -#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE0) && defined(BLOCK_SIZE1) && defined(BATCH_OUT) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl 
b/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl deleted file mode 100644 index bea61f53e..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#ifndef VEC_SIZE -#define VEC_SIZE 1 -#endif - -#if defined(OP_CODE) && defined(DATA_TYPE) -/** returns truth value of the two input tensors for BINARY LOGICAL OP. - * where BINARY LOGICAL OP can be AND, OR. - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=uchar - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using - * -DOP_CODE = number. e.g. -DOP_CODE=1 - * - * @param[in] input1_ptr Pointer to the source tensor. 
Supported data types: QASYMM8 - * @param[in] input1_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input1_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source tensor - * - * @param[in] input2_ptr Pointer to the source tensor.Supported data types: QASYMM8 - * @param[in] input2_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input2_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input2_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input2_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input2_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source tensor - * - * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: QASYMM8 - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - */ -__kernel void binary_logical_op( - TENSOR3D_DECLARATION(input1), - TENSOR3D_DECLARATION(input2), - TENSOR3D_DECLARATION(output)) -{ - Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1); - Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - #if OP_CODE == 1 // LOGICAL AND - VSTORE(VEC_SIZE) - (CONVERT(VLOAD(VEC_SIZE) - (0, (__global DATA_TYPE *)input1.ptr) && VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr), - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr); - - #elif OP_CODE == 2 // LOGICAL OR - VSTORE(VEC_SIZE) - (CONVERT(VLOAD(VEC_SIZE) - (0, (__global DATA_TYPE *)input1.ptr) || VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr), - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr); - - #else // OP NOT SUPPORTED - return - - #endif -} -#endif //if defined(OP_CODE) && defined(DATA_TYPE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl deleted file mode 100644 index 3d4675e5d..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#ifndef SCALE -#define SCALE 1.0f -#endif -#ifndef OFFSET -#define OFFSET 0 -#endif -#ifndef VEC_SIZE -#define VEC_SIZE 1 -#endif - -#if defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) -/** Perform a cast operation on an input tensor. - * - * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * - * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void cast( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) -{ - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr), - VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), - 0, (__global DATA_TYPE_OUT *)output.ptr); -} - -/** Perform a cast operation on an QASYMM8 input tensor. - * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int - * @attention Offset and Scale of input should be given as a preprocessor argument using -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5 - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * - * @param[in] input_ptr Pointer to the source image. 
Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void cast_qasymm_in( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) -{ - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_data = - VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr); - VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET); - VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE); - - 
VEC_DATA_TYPE(int, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(int, VEC_SIZE)) - offset; - VEC_DATA_TYPE(float, VEC_SIZE) out_data = CONVERT(tmp, VEC_DATA_TYPE(float, VEC_SIZE)) * scale; - - VSTORE(VEC_SIZE)(CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), - 0, (__global DATA_TYPE_OUT *)output.ptr); -} - - -/** Perform a cast operation on an QASYMM8 output tensor. - * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int - * @attention Offset and Scale of output should be given as a preprocessor argument using -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5 - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * - * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. 
Supported data types: U8 - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void cast_qasymm_out( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) -{ - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_data = - VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr); - VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET); - VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE); - - VEC_DATA_TYPE(float, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(float, VEC_SIZE)) / scale; - VEC_DATA_TYPE(float, VEC_SIZE) out_data = tmp + CONVERT(offset, VEC_DATA_TYPE(float, VEC_SIZE)); - - VSTORE(VEC_SIZE)(CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), - 0, (__global DATA_TYPE_OUT *)output.ptr); -} -#endif // defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op.cl deleted file mode 100644 index 765072556..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op.cl +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#ifndef VEC_SIZE -#define VEC_SIZE 1 -#endif - -#if defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) && defined(OP_CODE) -/** Returns truth value of comparison operators. - * Comparison operators may be equal, not_equal etc. - * - * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN, -DDATA_TYPE_OUT, - * e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT = uchar - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using - * -DOP_CODE = number. e.g. -DOP_CODE=1 - * - * @param[in] input1_ptr Pointer to the source tensor. 
Supported data types: U8/S8/U16/S16/F16/U32/S32/F32 - * @param[in] input1_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input1_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source tensor - * - * @param[in] input2_ptr Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32 - * @param[in] input2_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input2_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input2_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input2_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input2_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source tensor - * - * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: QASYMM8 - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void comparison_op( - TENSOR3D_DECLARATION(input1), - TENSOR3D_DECLARATION(input2), - TENSOR3D_DECLARATION(output)) -{ - Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1); - Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - #if OP_CODE == 1 //EQUAL - VSTORE(VEC_SIZE) - (CONVERT(VLOAD(VEC_SIZE) - (0, (__global DATA_TYPE_IN *)input1.ptr) == VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input2.ptr), - VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),0, (__global DATA_TYPE_OUT *)output.ptr); - - #elif OP_CODE == 2 //NOT_EQUAL - VSTORE(VEC_SIZE) - (CONVERT(VLOAD(VEC_SIZE) - (0, (__global DATA_TYPE_IN *)input1.ptr) != VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input2.ptr), - VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)output.ptr); - - #else // OP NOT SUPPORTED - return; - - #endif -} -#endif // defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) && defined(OP_CODE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl deleted file mode 100644 index 1eb305f7b..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl +++ /dev/null @@ -1,93 
+0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" -#define SUB(x, y) (x) - (y) - -#ifndef VEC_SIZE -#define VEC_SIZE 1 -#endif - -#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(DATA_TYPE_OUT) - -#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE) -#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) -#define VEC_OUT VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE) - -/** Returns the truth value of comparison . - * @attention Offset and Scale of both input should be given as a preprocessor argument using -DOFFSET_IN1=int, -DOFFSET_IN2=int, -DSCALE_IN1=float and -DSCALE_IN2=float. e.g. -DOFFSET_IN1=1, -DOFFSET_IN2=0, -DSCALE_IN1=0.5, -DSCALE_IN2=0.5 - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using - * -DOP_CODE = number. e.g. -DOP_CODE=1 - * - * @param[in] input1_ptr Pointer to the source tensor. 
Supported data types: QASYMM8 - * @param[in] input1_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input1_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source tensor - * - * @param[in] input2_ptr Pointer to the source tensor. Supported data types: QASYMM8 - * @param[in] input2_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input2_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input2_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input2_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input2_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source tensor - * - * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: QASYMM8 - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void comparison_op_qasymm8( - TENSOR3D_DECLARATION(in1), - TENSOR3D_DECLARATION(in2), - TENSOR3D_DECLARATION(out)) -{ - // Get pixels pointer - Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); - Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); - - VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)in1.ptr), VEC_INT); - VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)in2.ptr), VEC_INT); - - in_a = SUB(in_a, (VEC_INT)((int)OFFSET_IN1)); - in_b = SUB(in_b, (VEC_INT)((int)OFFSET_IN2)); - - const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1); - const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2); - - #if OPCODE == 1 //EQUAL QUANTIZED - VSTORE(VEC_SIZE)(CONVERT(in1f32 == in2f32, VEC_OUT), 0, (__global DATA_TYPE_OUT *)out.ptr); - - #elif OPCODE == 2 //NOT EQUAL QUANTIZED - VSTORE(VEC_SIZE)(CONVERT(in1f32 != in2f32, VEC_OUT), 0, (__global DATA_TYPE_OUT *)out.ptr); - - #else // OP NOT SUPPORTED - return; - - #endif -} -#endif // defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(DATA_TYPE_OUT) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl 
b/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl deleted file mode 100644 index fef2243e7..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016, 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) -/** Perform space to depth rearrangement of tensor - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16 - * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. -DBLOCK_SIZE=1 - * - * @param[in] input_ptr Pointer to the source image. 
Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p inpu -t_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in -bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void depth_to_space( - TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output)) - { - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); - - int out_index[4]={0}; - int in_index[4]={0}; - 
- out_index[0] = get_global_id(0);//W - out_index[1] = get_global_id(1);//H - out_index[2] = get_global_id(2) % DEPTH_OUT;//C - out_index[3] = get_global_id(2) / DEPTH_OUT;//B - - in_index[0] = out_index[0]/BLOCK_SIZE; - in_index[1] = out_index[1]/BLOCK_SIZE; - in_index[2] = out_index[2] + ((out_index[1] % BLOCK_SIZE) * BLOCK_SIZE + out_index[0] % BLOCK_SIZE) * DEPTH_OUT; - in_index[3] = out_index[3]; - - *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2],in_index[3])); - } -#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl deleted file mode 100644 index 348458fe9..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#ifndef VEC_SIZE -#define VEC_SIZE 1 -#endif - -#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS) -/** Perform embedding_lookup of input tensor - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. 
-DVEC_SIZE=16 - * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16 - * @attention Number of input dimensions are passed as a preprocessor argument using -DNUM_DIMS=size, e.g. -DNUM_DIMS=4 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] lookups_ptr Pointer to the lookups vector. Supported data types: S32 - * @param[in] lookups_stride_x Stride of the lookups vector in X dimension (in bytes) - * @param[in] lookups_step_x lookups_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector - */ - -__kernel void embedding_lookup(TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output), - VECTOR_DECLARATION(lookups)) -{ - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT); - - Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups); - - //lookup ids for based on the tensor dimensions - int lup_id[4] = {0}; - - lup_id[0] = (NUM_DIMS == 1)?*((__global int *)vector_offset(&lups,get_global_id(0))) - :get_global_id(0); - lup_id[1] = (NUM_DIMS == 2)?*((__global int *)vector_offset(&lups,get_global_id(1))) - :get_global_id(1); - lup_id[2] = (NUM_DIMS == 3)?*((__global int 
*)vector_offset(&lups,get_global_id(2))) - :get_global_id(2)%DEPTH_OUT; - lup_id[3] = (NUM_DIMS == 4)?*((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) - :get_global_id(2) / DEPTH_OUT; - - in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + lup_id[1] * input_step_y - + lup_id[2] * input_step_z + lup_id[3] * input_step_w; - - VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), - 0, (__global DATA_TYPE *)out.ptr); -} -#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl deleted file mode 100644 index 69d94f30a..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#ifndef VEC_SIZE -#define VEC_SIZE 1 -#endif - -#if defined(DATA_TYPE) -/** Perform an exponential operation on an input tensor. - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @note Can only take floating point data types. 
- * - * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void exp_layer( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) -{ - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VSTORE(VEC_SIZE) - (exp(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr)), 0, (__global DATA_TYPE *)output.ptr); -} -#endif // defined(DATA_TYPE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl 
b/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl deleted file mode 100644 index 6b767d6c9..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -/** Perform gather - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * - * @param[in] input1_ptr Pointer to the first source tensor. Supported data types: U8/S32/F32 - * @param[in] input1_stride_x Stride of the first source tensor in X dimension (in bytes) - * @param[in] input1_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input1_stride_y Stride of the first source tensor in Y dimension (in bytes) - * @param[in] input1_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input1_stride_z Stride of the first source tensor in Z dimension (in bytes) - * @param[in] input1_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[in] input2_ptr Pointer to the first source tensor. 
Supported data types: U32 - * @param[in] input2_stride_x Stride of the first source tensor in X dimension (in bytes) - * @param[in] input2_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void gather(IMAGE_DECLARATION(input1), - VECTOR_DECLARATION(input2), - IMAGE_DECLARATION(output)) -{ - Image in1 = CONVERT_TO_IMAGE_STRUCT_NO_STEP(input1); - Vector in2 = CONVERT_TO_VECTOR_STRUCT(input2); - Image out = CONVERT_TO_IMAGE_STRUCT_NO_STEP(output); - - VEC_DATA_TYPE(DATA_TYPE_IN2, 2) - in2_data = CONVERT(vload2(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_IN2, 2)); - - //TODO: performance tuning for memcopy - int index = in2_data.s0; - int stride=input1_stride_y/input1_stride_x; - - for(int i=0; i<stride; i++){ - *((__global DATA_TYPE_OUT *)offset(&out, i,get_global_id(0)))=*((__global DATA_TYPE_IN1 *)offset(&in1, i,index)); - } -} - -__kernel void gather_1d_out(IMAGE_DECLARATION(input1), - VECTOR_DECLARATION(input2), - VECTOR_DECLARATION(output)) -{ - Image in1 = CONVERT_TO_IMAGE_STRUCT_NO_STEP(input1); 
- Vector in2 = CONVERT_TO_VECTOR_STRUCT(input2); - Vector out = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output); - - VEC_DATA_TYPE(DATA_TYPE_IN2, 2) - in2_data = CONVERT(vload2(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_IN2, 2)); - - //TODO: performance tuning for memcopy - int index = in2_data.s0; - int stride=input1_stride_y/input1_stride_x; - - for(int i=0; i<stride; i++){ - *((__global DATA_TYPE_OUT *)vector_offset(&out, i+get_global_id(0)))=*((__global DATA_TYPE_IN1 *)offset(&in1, i, index)); - } -} - -__kernel void gather_1d(VECTOR_DECLARATION(input1), - VECTOR_DECLARATION(input2), - VECTOR_DECLARATION(output)) -{ - Vector in1 = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input1); - Vector in2 = CONVERT_TO_VECTOR_STRUCT(input2); - Vector out = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output); - - VEC_DATA_TYPE(DATA_TYPE_IN2, 2) - in2_data = CONVERT(vload2(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_IN2, 2)); - - //TODO: performance tuning for memcopy - int index = in2_data.s0; - *((__global DATA_TYPE_OUT *)vector_offset(&out, get_global_id(0)))=*((__global DATA_TYPE_IN1 *)vector_offset(&in1, index)); -} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl deleted file mode 100644 index ed7409852..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#ifndef VEC_SIZE -#define VEC_SIZE 1 -#endif - -#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS) -/** Perform hashtable_lookup of input tensor - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16 - * @attention Number of input dimensions are passed as a preprocessor argument using -DNUM_DIMS=size, e.g. -DNUM_DIMS=4 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] lookups_ptr Pointer to the lookups vector. Supported data types: S32 - * @param[in] lookups_stride_x Stride of the lookups vector in X dimension (in bytes) - * @param[in] lookups_step_x lookups_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector - */ -__kernel void hashtable_lookup(TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output), - VECTOR_DECLARATION(lookups)) -{ - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT); - - Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups); - - int lup_id[4] = {0}; - - lup_id[0] = (NUM_DIMS == 1)?*((__global int *)vector_offset(&lups,get_global_id(0))) - :get_global_id(0); - lup_id[1] = (NUM_DIMS == 2)?*((__global int *)vector_offset(&lups,get_global_id(1))) - :get_global_id(1); - lup_id[2] = (NUM_DIMS == 3)?*((__global int *)vector_offset(&lups,get_global_id(2))) - 
:get_global_id(2)%DEPTH_OUT; - lup_id[3] = (NUM_DIMS == 4)?*((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) - :get_global_id(2) / DEPTH_OUT; - - if (lup_id[NUM_DIMS-1] < 0) - { - VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, 0, (__global DATA_TYPE *)out.ptr); - return; - } - - in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + lup_id[1] * input_step_y - + lup_id[2] * input_step_z + lup_id[3] * input_step_w; - - VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), - 0, (__global DATA_TYPE *)out.ptr); -} -#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h deleted file mode 100644 index 0e123ae0a..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h +++ /dev/null @@ -1,352 +0,0 @@ -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_HELPER_H -#define ARM_COMPUTE_HELPER_H - -#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) -#pragma OPENCL EXTENSION cl_khr_fp16 : enable -#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) - -#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) -#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable -#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) - -#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && \ - defined(cl_arm_integer_dot_product_accumulate_int8) -#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable -#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && - // defined(cl_arm_integer_dot_product_accumulate_int8) - -#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) -#pragma OPENCL EXTENSION cl_arm_printf : enable -#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) - -#define EXPAND(x) x - -#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) - -#define VLOAD_STR(size) vload##size -#define VLOAD(size) VLOAD_STR(size) - -#define VSTORE_STR(size) vstore##size -#define VSTORE(size) VSTORE_STR(size) - -#define VEC_DATA_TYPE_STR(type, size) type##size -#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) - -#define CL_VEC_DATA_TYPE_STR(type, size) type##size -#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size) - -#define CONVERT_STR(x, type) (convert_##type((x))) -#define CONVERT(x, type) CONVERT_STR(x, type) - -#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x))) -#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) - 
-#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x))) -#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round) - -#define VECTOR_DECLARATION(name) \ - __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, \ - uint name##_offset_first_element_in_bytes - -#define IMAGE_DECLARATION(name) \ - __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \ - uint name##_step_y, uint name##_offset_first_element_in_bytes - -#define TENSOR3D_DECLARATION(name) \ - __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \ - uint name##_step_y, uint name##_stride_z, uint name##_step_z, \ - uint name##_offset_first_element_in_bytes - -#define TENSOR4D_DECLARATION(name) \ - __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \ - uint name##_step_y, uint name##_stride_z, uint name##_step_z, uint name##_stride_w, \ - uint name##_step_w, uint name##_offset_first_element_in_bytes - -#define CONVERT_TO_VECTOR_STRUCT(name) \ - update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ - name##_step_x) - -#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \ - update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0) - -#define CONVERT_TO_IMAGE_STRUCT(name) \ - update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ - name##_step_x, name##_stride_y, name##_step_y) - -#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ - update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \ - name##_stride_y, 0) - -#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ - update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \ - name##_stride_x, name##_step_x, name##_stride_y, \ - name##_step_y, name##_stride_z, name##_step_z) - -#define 
CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ - update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \ - name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, \ - name##_step_z) - -#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ - update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \ - name##_stride_x, name##_step_x, name##_stride_y, \ - name##_step_y, name##_stride_z, name##_step_z) - -#define CONVERT_TO_TENSOR3D_STRUCT(name) \ - update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ - name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \ - name##_step_z) - -#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ - update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ - 0, name##_stride_y, 0, name##_stride_z, 0) - -#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ - update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ - name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \ - name##_step_z, name##_stride_w, name##_step_w, mod_size) - -#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ - update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ - 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, \ - mod_size) - -/** Structure to hold Vector information */ -typedef struct Vector -{ - __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ - int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ - int stride_x; /**< Stride of the image in X dimension (in bytes) */ -} Vector; - -/** Structure to hold Image information */ -typedef struct Image -{ - __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ - int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ - 
int stride_x; /**< Stride of the image in X dimension (in bytes) */ - int stride_y; /**< Stride of the image in Y dimension (in bytes) */ -} Image; - -/** Structure to hold 3D tensor information */ -typedef struct Tensor3D -{ - __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ - int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ - int stride_x; /**< Stride of the image in X dimension (in bytes) */ - int stride_y; /**< Stride of the image in Y dimension (in bytes) */ - int stride_z; /**< Stride of the image in Z dimension (in bytes) */ -} Tensor3D; - -/** Structure to hold 4D tensor information */ -typedef struct Tensor4D -{ - __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ - int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ - int stride_x; /**< Stride of the image in X dimension (in bytes) */ - int stride_y; /**< Stride of the image in Y dimension (in bytes) */ - int stride_z; /**< Stride of the image in Z dimension (in bytes) */ - int stride_w; /**< Stride of the image in W dimension (in bytes) */ -} Tensor4D; - -/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's - * data. 
- * - * @param[in] ptr Pointer to the starting postion of the buffer - * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector - * @param[in] stride_x Stride of the vector in X dimension (in bytes) - * @param[in] step_x stride_x * number of elements along X processed per - * workitem(in bytes) - * - * @return An image object - */ -inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, - uint stride_x, uint step_x) -{ - Vector vector = { - .ptr = ptr, - .offset_first_element_in_bytes = offset_first_element_in_bytes, - .stride_x = stride_x, - }; - vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x; - return vector; -} - -/** Wrap image information into an Image structure, and make the pointer point at this workitem's - * data. - * - * @param[in] ptr Pointer to the starting postion of the buffer - * @param[in] offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] stride_x Stride of the image in X dimension (in bytes) - * @param[in] step_x stride_x * number of elements along X processed per - * workitem(in bytes) - * @param[in] stride_y Stride of the image in Y dimension (in bytes) - * @param[in] step_y stride_y * number of elements along Y processed per - * workitem(in bytes) - * - * @return An image object - */ -inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, - uint stride_x, uint step_x, uint stride_y, uint step_y) -{ - Image img = {.ptr = ptr, - .offset_first_element_in_bytes = offset_first_element_in_bytes, - .stride_x = stride_x, - .stride_y = stride_y}; - img.ptr += - img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; - return img; -} - -/** Wrap 3D tensor information into an image structure, and make the pointer point at this - * workitem's data. 
- * - * @param[in] ptr Pointer to the starting postion of the buffer - * @param[in] offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] stride_x Stride of the image in X dimension (in bytes) - * @param[in] step_x stride_x * number of elements along X processed per - * workitem(in bytes) - * @param[in] stride_y Stride of the image in Y dimension (in bytes) - * @param[in] step_y stride_y * number of elements along Y processed per - * workitem(in bytes) - * @param[in] stride_z Stride of the image in Z dimension (in bytes) - * @param[in] step_z stride_z * number of elements along Z processed per - * workitem(in bytes) - * - * @return A 3D tensor object - */ -inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, - uint offset_first_element_in_bytes, - uint stride_x, uint step_x, uint stride_y, - uint step_y, uint stride_z, uint step_z) -{ - Image img = {.ptr = ptr, - .offset_first_element_in_bytes = offset_first_element_in_bytes, - .stride_x = stride_x, - .stride_y = stride_y}; - img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + - get_global_id(1) * step_y + get_global_id(2) * step_z; - return img; -} - -/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this - * workitem's data. 
- * - * @param[in] ptr Pointer to the starting postion of the buffer - * @param[in] offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] stride_x Stride of the image in X dimension (in bytes) - * @param[in] step_x stride_x * number of elements along X processed per - * workitem(in bytes) - * @param[in] stride_y Stride of the image in Y dimension (in bytes) - * @param[in] step_y stride_y * number of elements along Y processed per - * workitem(in bytes) - * @param[in] stride_z Stride of the image in Z dimension (in bytes) - * @param[in] step_z stride_z * number of elements along Z processed per - * workitem(in bytes) - * - * @return A 3D tensor object - */ -inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, - uint offset_first_element_in_bytes, uint stride_x, - uint step_x, uint stride_y, uint step_y, uint stride_z, - uint step_z) -{ - Tensor3D tensor = {.ptr = ptr, - .offset_first_element_in_bytes = offset_first_element_in_bytes, - .stride_x = stride_x, - .stride_y = stride_y, - .stride_z = stride_z}; - tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + - get_global_id(1) * step_y + get_global_id(2) * step_z; - return tensor; -} - -inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, - uint offset_first_element_in_bytes, uint stride_x, - uint step_x, uint stride_y, uint step_y, uint stride_z, - uint step_z, uint stride_w, uint step_w, uint mod_size) -{ - Tensor4D tensor = {.ptr = ptr, - .offset_first_element_in_bytes = offset_first_element_in_bytes, - .stride_x = stride_x, - .stride_y = stride_y, - .stride_z = stride_z, - .stride_w = stride_w}; - - tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + - get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + - (get_global_id(2) / mod_size) * step_w; - return tensor; -} - -/** Get the pointer position of a Vector - * - * @param[in] vec Pointer to the starting position of the 
buffer - * @param[in] x Relative X position - */ -inline __global const uchar *vector_offset(const Vector *vec, int x) -{ - return vec->ptr + x * vec->stride_x; -} - -/** Get the pointer position of a Image - * - * @param[in] img Pointer to the starting position of the buffer - * @param[in] x Relative X position - * @param[in] y Relative Y position - */ -inline __global uchar *offset(const Image *img, int x, int y) -{ - return img->ptr + x * img->stride_x + y * img->stride_y; -} - -/** Get the pointer position of a Tensor3D - * - * @param[in] tensor Pointer to the starting position of the buffer - * @param[in] x Relative X position - * @param[in] y Relative Y position - * @param[in] z Relative Z position - */ -inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z) -{ - return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z; -} - -/** Get the pointer position of a Tensor4D - * - * @param[in] tensor Pointer to the starting position of the buffer - * @param[in] x Relative X position - * @param[in] y Relative Y position - * @param[in] z Relative Z position - * @param[in] w Relative W position - */ -inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w) -{ - return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + - w * tensor->stride_w; -} - -#endif // _HELPER_H diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h b/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h deleted file mode 100644 index c39138caa..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h +++ /dev/null @@ -1,406 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_HELPERS_ASYMM_H -#define ARM_COMPUTE_HELPERS_ASYMM_H - -#include "helpers.h" - -/** Correctly-rounded-to-nearest division by a power-of-two. - * - * @param[in] size Size of vector. - * - * @return Correctly-rounded-to-nearest division by a power-of-two. 
- */ -#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, int exponent) \ - { \ - VEC_DATA_TYPE(int, size) \ - mask = (1 << exponent) - 1; \ - const VEC_DATA_TYPE(int, size) zero = 0; \ - const VEC_DATA_TYPE(int, size) one = 1; \ - VEC_DATA_TYPE(int, size) \ - threshold = (mask >> 1) + select(zero, one, x < 0); \ - return (x >> exponent) + select(zero, one, (x & mask) > threshold); \ - } - -/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1), - * rounding to the nearest value, and saturating -1 * -1 to the maximum value. - * - * @param[in] size Size of vector. - * - * @return Product of two fixed-point numbers. - */ -#define ASYMM_MULT_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ - { \ - VEC_DATA_TYPE(int, size) \ - overflow = a == b && a == INT_MIN; \ - VEC_DATA_TYPE(long, size) \ - a_64 = convert_long##size(a); \ - VEC_DATA_TYPE(long, size) \ - b_64 = convert_long##size(b); \ - VEC_DATA_TYPE(long, size) \ - ab_64 = a_64 * b_64; \ - /* COMPMID-907 */ \ - VEC_DATA_TYPE(int, size) \ - ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31)); \ - return select(ab_x2_high32, INT_MAX, overflow); \ - } - -/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0). - * - * @param[in] size Size of vector. - * - * @return Result in fixed-point format Q0. 
- */ -#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) \ - a) \ - { \ - const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \ - const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \ - const int k_fractional_bits = 31; \ - VEC_DATA_TYPE(int, size) \ - x = a + (1 << (k_fractional_bits - 3)); \ - VEC_DATA_TYPE(int, size) \ - x2 = ASYMM_MULT(x, x, size); \ - VEC_DATA_TYPE(int, size) \ - x3 = ASYMM_MULT(x2, x, size); \ - VEC_DATA_TYPE(int, size) \ - x4 = ASYMM_MULT(x2, x2, size); \ - VEC_DATA_TYPE(int, size) \ - x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \ - VEC_DATA_TYPE(int, size) \ - x4_over_24_plus_x3_over_6_plus_x2 = \ - ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \ - VEC_DATA_TYPE(int, size) \ - x4_over_24_plus_x3_over_6_plus_x2_over_2 = \ - ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \ - return constant_term + \ - ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \ - } - -/** Each bit of the result is set to the corresponding bit of either then_val or - * else_val depending on whether the corresponding bit of if_mask is set. - * Equivalent to the VBSL instruction in ARM NEON. - * - * @param[in] size Size of vector. - * - * @returns Result contaning bits from @p then_val or from @p else_val depending on corresponding - * bit in @p if_mask is set or not. - */ -#define ASYMM_SELECT_USING_MASK_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, \ - VEC_DATA_TYPE(int, size) then_val, \ - VEC_DATA_TYPE(int, size) else_val) \ - { \ - return (if_mask & then_val) ^ (~if_mask & else_val); \ - } - -/** For each element of input vector, the corresponding bits of the result item are set - * if the input item is zero. 
- * - * @param[in] size Size of vector. - * - * @returns Output vector with bits set when corresponding bit in @p a is zero. - */ -#define ASYMM_MASK_IF_ZERO_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \ - { \ - const VEC_DATA_TYPE(int, size) all_zeros = 0; \ - const VEC_DATA_TYPE(int, size) all_ones = ~0; \ - return select(all_zeros, all_ones, a == 0); \ - } - -/** For each element of input vector, the corresponding bits of the result item are set - * if the input item is non-zero. - * - * @param[in] size Size of vector. - * - * @returns Output vector with bits set when corresponding bit in @p a is non zero. - */ -#define ASYMM_MASK_IF_NON_ZERO_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \ - { \ - const VEC_DATA_TYPE(int, size) all_zeros = 0; \ - const VEC_DATA_TYPE(int, size) all_ones = ~0; \ - return select(all_zeros, all_ones, a != 0); \ - } - -#define EXP_BARREL_SHIFTER_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size( \ - VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, \ - int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \ - { \ - if (k_integer_bits > exponent) \ - { \ - const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0; \ - return ASYMM_SELECT_USING_MASK( \ - ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \ - ASYMM_MULT(result, fp_multiplier, size), result, size); \ - } \ - \ - return result; \ - } - -/** Calculates \f$ exp(x) \f$ for x < 0. - * - * @param[in] size Size of vector. - * - * @return Result in fixed-point format Q0. 
- */ -#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \ - { \ - const int k_fractional_bits = 31 - k_integer_bits; \ - VEC_DATA_TYPE(int, size) \ - k_one_quarter = 1 << (k_fractional_bits - 2); \ - VEC_DATA_TYPE(int, size) \ - mask = k_one_quarter - 1; \ - VEC_DATA_TYPE(int, size) \ - a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter; \ - VEC_DATA_TYPE(int, size) \ - a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \ - VEC_DATA_TYPE(int, size) \ - result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL( \ - a_mod_quarter_minus_one_quarter_scaled, size); \ - VEC_DATA_TYPE(int, size) \ - remainder = a_mod_quarter_minus_one_quarter - a; \ - \ - result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, \ - remainder, size); \ - result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, \ - remainder, size); \ - result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, \ - remainder, size); \ - result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, \ - remainder, size); \ - result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, \ - remainder, size); \ - result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, \ - size); \ - result = \ - EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \ - \ - if (k_integer_bits > 5) \ - { \ - const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5)); \ - result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size); \ - } \ - \ - const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \ - return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size); \ - } - -/** Calculates 
the product of a integer value by a power of two, with either a positive exponent - * (equivalent to an arithmetic left shift, saturating) or a negative exponent - * (equivalent to an arithmetic right shift, rounding to nearest). - * - * @param[in] size Size of vector. - * - * @return Arithmetic left or right shift. - */ -#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \ - { \ - if (exponent < 0) \ - { \ - return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \ - } \ - \ - const VEC_DATA_TYPE(int, size) min = INT_MIN; \ - const VEC_DATA_TYPE(int, size) max = INT_MAX; \ - int threshold = ((1 << (31 - exponent)) - 1); \ - VEC_DATA_TYPE(int, size) \ - positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \ - VEC_DATA_TYPE(int, size) \ - negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \ - VEC_DATA_TYPE(int, size) \ - result = x << exponent; \ - result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \ - result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \ - return result; \ - } - -/** Calculates (a+b)/2, rounded to the nearest integer. - * Equivalent to VRHADD in the ARM NEON instruction set. - * - * @param[in] size Size of vector. - * - * @return (a+b)/2, rounded to the nearest integer. 
- */ -#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ - { \ - VEC_DATA_TYPE(long, size) \ - a64 = convert_long##size(a); \ - VEC_DATA_TYPE(long, size) \ - b64 = convert_long##size(b); \ - VEC_DATA_TYPE(long, size) \ - sum = a64 + b64; \ - const VEC_DATA_TYPE(long, size) one = 1; \ - const VEC_DATA_TYPE(long, size) minus_one = -1; \ - VEC_DATA_TYPE(long, size) \ - sign = select(minus_one, one, sum >= 0); \ - return convert_int##size((sum + sign) / 2); \ - } - -/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1). - * - * @param[in] size Size of vector. - * - * @return Result in fixed-point format Q0. - */ -#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \ - { \ - const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \ - const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2); \ - VEC_DATA_TYPE(int, size) \ - half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size); \ - const VEC_DATA_TYPE(int, size) Q2_48_over_17 = 1515870810; \ - const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540; \ - VEC_DATA_TYPE(int, size) \ - x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size); \ - for (int i = 0; i < 3; i++) \ - { \ - VEC_DATA_TYPE(int, size) \ - half_denominator_times_x = ASYMM_MULT(half_denominator, x, size); \ - VEC_DATA_TYPE(int, size) \ - one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x; \ - VEC_DATA_TYPE(int, size) \ - tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size); \ - x = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size); \ - } \ - return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size); \ - } - -/** Considering the integer value as fixed-point, change the number of integer bits and update value - * accordingly. - * - * @param[in] size Size of vector. 
- * - * @return Rescaled value. - */ -#define ASYMM_RESCALE_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, \ - int src_integer_bits, int dst_integer_bits) \ - { \ - int exponent = src_integer_bits - dst_integer_bits; \ - return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \ - } - -#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) \ - asymm_rounding_divide_by_POW2_##size(x, exponent) -#define ASYMM_MULT(a, b, size) asymm_mult##size(a, b) -#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \ - ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size) -#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) \ - asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a) -#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) \ - asymm_select_using_mask##size(if_mask, then_val, else_val) -#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a) -#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a) -#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, \ - remainder, size) \ - exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, \ - remainder) -#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) \ - asymm_exp_on_negative_values##size(a, k_integer_bits) -#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) \ - asymm_one_over_one_plus_x_for_x_in_0_1##size(a) -#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) \ - asymm_saturating_rounding_mult_by_pow2##size(x, exponent) -#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b) -#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \ - asymm_rescale##size(value, src_integer_bits, dst_integer_bits) - -ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2) 
-ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4) -ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8) -ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16) - -ASYMM_MULT_IMPL(2) -ASYMM_MULT_IMPL(4) -ASYMM_MULT_IMPL(8) -ASYMM_MULT_IMPL(16) - -ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2) -ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4) -ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8) -ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16) - -ASYMM_SELECT_USING_MASK_IMPL(2) -ASYMM_SELECT_USING_MASK_IMPL(4) -ASYMM_SELECT_USING_MASK_IMPL(8) -ASYMM_SELECT_USING_MASK_IMPL(16) - -ASYMM_MASK_IF_ZERO_IMPL(2) -ASYMM_MASK_IF_ZERO_IMPL(4) -ASYMM_MASK_IF_ZERO_IMPL(8) -ASYMM_MASK_IF_ZERO_IMPL(16) - -ASYMM_MASK_IF_NON_ZERO_IMPL(2) -ASYMM_MASK_IF_NON_ZERO_IMPL(4) -ASYMM_MASK_IF_NON_ZERO_IMPL(8) -ASYMM_MASK_IF_NON_ZERO_IMPL(16) - -EXP_BARREL_SHIFTER_IMPL(2) -EXP_BARREL_SHIFTER_IMPL(4) -EXP_BARREL_SHIFTER_IMPL(8) -EXP_BARREL_SHIFTER_IMPL(16) - -ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2) -ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4) -ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8) -ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16) - -ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2) -ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4) -ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8) -ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16) - -ASYMM_ROUNDING_HALF_SUM_IMPL(2) -ASYMM_ROUNDING_HALF_SUM_IMPL(4) -ASYMM_ROUNDING_HALF_SUM_IMPL(8) -ASYMM_ROUNDING_HALF_SUM_IMPL(16) - -ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2) -ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4) -ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8) -ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16) - -ASYMM_RESCALE_IMPL(2) -ASYMM_RESCALE_IMPL(4) -ASYMM_RESCALE_IMPL(8) -ASYMM_RESCALE_IMPL(16) - -#endif // ARM_COMPUTE_HELPERS_ASYMM_H
\ No newline at end of file diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl deleted file mode 100644 index e3aa463db..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#ifndef VEC_SIZE -#define VEC_SIZE 1 -#endif - -#if defined(DATA_TYPE) -/** Performs a negation of input tensor. - * - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * - * @param[in] in_ptr Pointer to the source image. Supported data types: S16/S32/F16/F32. - * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. 
Supported data types: same as @p input_ptr - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image - */ -__kernel void neg_tensor( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) -{ - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VSTORE(VEC_SIZE) - (-VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), 0, (__global DATA_TYPE *)output.ptr); -} -#endif // defined(DATA_TYPE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl deleted file mode 100644 index ecf4696e9..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016, 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#if defined(IW) && defined(IH) && defined(ID) && defined(IB) && defined(DEPTH_OUT) && defined(ZERO_VALUE) -/** Perform space to depth rearrangement of tensor - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. 
-DDATA_TYPE=float - * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 - * @attention Input dimensions should be passed as a preprocessor argument using -DIW(width), -DIH(height), -DID(depth) and -DIB(batch). e.g. -DIW = 4 - * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * - * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p inpu -t_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in -bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * - * @param[in] pad_values Padding values for each of the dimensions. Only pad values for Up(for - * batch), Top(for height), Left(for width) and Front(for depth) are - * required. 
Supported data type: S32 - */ - -__kernel void pad( - TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output), - const int4 pad_values) - { - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); - - int index[4]={0}; - - index[0] = get_global_id(0);//W - index[1] = get_global_id(1);//H - index[2] = get_global_id(2) % DEPTH_OUT;//C - index[3] = get_global_id(2) / DEPTH_OUT;//N - - if (index[0] < pad_values.x || index[0] >= (IW + pad_values.x) || - index[1] < pad_values.y || index[1] >= (IH + pad_values.y) || - index[2] < pad_values.z || index[2] >= (ID + pad_values.z) || - index[3] < pad_values.w || index[3] >= (IB + pad_values.w)) - { - *((__global DATA_TYPE *)out.ptr) = (DATA_TYPE)ZERO_VALUE; - } - else - { - *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *) - tensor4D_offset(&in, index[0] - pad_values.x, - index[1] - pad_values.y, - index[2] - pad_values.z, - index[3] - pad_values.w)); - } - } - -#endif //if defined(IW) && defined(IH) && defined(ID) && defined(IB) && defined(DEPTH_OUT) && defined(ZERO_VALUE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl deleted file mode 100644 index 7cc8b0354..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4) -/** Perform a Generic permute operation on an input tensor of Shape DCHW. - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16 - * @attention Permutation vector is passed as a preprocessor arguement using -DP1, -DP2, -DP3 and -DP4=int, e.g. -DP1=2 - * - * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U1 -6/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in b -ytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in b -ytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in b -ytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. 
Supported data types: same as @p inpu -t_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in -bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void permute_generic( - TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output)) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); - - int out_index[4]; - int in_index[4]; - in_index[0] = get_global_id(0);//W - in_index[1] = get_global_id(1);//H - in_index[2] = get_global_id(2) % DEPTH_IN;//C - in_index[3] = get_global_id(2) / DEPTH_IN;//B - out_index[0] = in_index[P1]; - out_index[1] = in_index[P2]; - out_index[2] = in_index[P3]; - out_index[3] = in_index[P4]; - - *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0],out_index[1],out_index[2],out_index[3])) = *((__global DATA_TYPE *)in.ptr); -} -#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl deleted file mode 100644 index aa05121b1..000000000 --- 
a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016, 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#ifdef SATURATE -#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x)) -#else /* SATURATE */ -#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x)) -#endif /* SATURATE */ -#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round) - -/** Performs a pixelwise division with float scale of either integer or float inputs. - * - * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: - * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short - * @attention The data type of the intermediate result of the division should passed as well using -DDATA_TYPE_RES. - * e.g. If one of inputs is S16 -DDATA_TYPE_RES=int should be passed else -DDATA_TYPE_RES=short. - * @attention -DDATA_TYPE_FLOAT must be passed if floating point inputs are provided. - * - * @param[in] in1_ptr Pointer to the source image. 
Supported data types: U8, S16, F16, F32 - * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in1_stride_z Stride of the source image in Y dimension (in bytes) - * @param[in] in1_step_z in1_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16, F16, F32 - * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in2_stride_z Stride of the source image in Y dimension (in bytes) - * @param[in] in2_step_z in2_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. 
Supported data types: U8, S16, F16, F32 - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_stride_z Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_z out_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] scale Float scaling factor. Supported data types: F32 - */ -__kernel void pixelwise_div_float( - TENSOR3D_DECLARATION(in1), - TENSOR3D_DECLARATION(in2), - TENSOR3D_DECLARATION(out), - const float scale) -{ - // Get pixels pointer - Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); - Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); - - // Load data - VEC_DATA_TYPE(DATA_TYPE_RES, 16) - in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16)); - VEC_DATA_TYPE(DATA_TYPE_RES, 16) - in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16)); - - // Perform division -#ifdef DATA_TYPE_FLOAT - VEC_DATA_TYPE(DATA_TYPE_OUT, 16) - res = CONVERT(in1_data / in2_data * (DATA_TYPE_RES)scale, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)); -#else /* DATA_TYPE_FLOAT */ - VEC_DATA_TYPE(DATA_TYPE_OUT, 16) - res = CONVERT_OP_FLOAT(CONVERT_OP_FLOAT((convert_float16(in1_data / in2_data) * scale), VEC_DATA_TYPE(DATA_TYPE_RES, 16), ROUND), VEC_DATA_TYPE(DATA_TYPE_OUT, 16), ROUND); -#endif /* DATA_TYPE_FLOAT */ - - // Store result - vstore16(res, 0, (__global DATA_TYPE_OUT *)out.ptr); -} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl 
b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl deleted file mode 100644 index fdfb78003..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016, 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#if defined(SATURATE) -#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size##_sat(x)) -#else // SATURATE -#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size(x)) -#endif // SATURATE -#define CONVERT_OP_INT(x, type, size) CONVERT_OP_INT_STR(x, type, size) - -#define DIV_OP(x, y, scale, type, size) CONVERT_OP_INT((x) / (y) >> scale, type, size) - -/** Performs a pixelwise division with integer scale of integer inputs. - * - * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: - * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short - * @attention The data_type of the intermediate result of the division should passed as well using -DDATA_TYPE_RES. - * e.g. If one of inputs is S16 -DDATA_TYPE_RES=int should be passed else -DDATA_TYPE_RES=short. - * - * @param[in] in1_ptr Pointer to the source image. 
Supported data types: U8/S16 - * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in1_stride_z Stride of the source image in Y dimension (in bytes) - * @param[in] in1_step_z in1_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] in2_ptr Pointer to the source image. Supported data types: same as @p in1_ptr - * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in2_stride_z Stride of the source image in Y dimension (in bytes) - * @param[in] in2_step_z in2_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. 
Supported data types: same as @p in1_ptr - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_stride_z Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_z out_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] scale Integer scaling factor. Supported data types: S32 - */ -__kernel void pixelwise_div_int( - TENSOR3D_DECLARATION(in1), - TENSOR3D_DECLARATION(in2), - TENSOR3D_DECLARATION(out), - const uint scale) -{ - // Get pixels pointer - Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); - Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); - - // Load data - VEC_DATA_TYPE(DATA_TYPE_RES, 16) - in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16)); - VEC_DATA_TYPE(DATA_TYPE_RES, 16) - in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16)); - - // Perform division and store result - vstore16(DIV_OP(in1_data, in2_data, scale, DATA_TYPE_OUT, 16), 0, (__global DATA_TYPE_OUT *)out.ptr); -} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl deleted file mode 100644 index ab1307e64..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016, 2017 ARM Limited. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers_asymm.h" - -#ifdef SATURATE -#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x)) -#else /* SATURATE */ -#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x)) -#endif /* SATURATE */ -#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round) - -#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT) -/** Performs a pixelwise multiplication used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8 - * - * The following computations will be performed by the kernel: - * - * -# Add offset terms to inputs - * -# Multiply inputs - * -# Add offset terms to final result - * -# Multiply each entry of result by result_mult_int - * -# Shift the int32 accumulator by result_shift - * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. - * - * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: - * e.g. 
-DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar - * @attention The offset factor of inputs must be passed at compile time using -DIN1_OFFSET and -DIN2_OFFSET - * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT - * - * @param[in] in1_ptr Pointer to the source image. Supported data types: U8 - * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in1_stride_z Stride of the source image in Y dimension (in bytes) - * @param[in] in1_step_z in1_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] in2_ptr Pointer to the source image. Supported data types: U8 - * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in2_stride_z Stride of the source image in Y dimension (in bytes) - * @param[in] in2_step_z in2_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. 
Supported data types: U8 - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_stride_z Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_z out_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] scale Float scaling factor. Supported data types: F32 - */ -__kernel void pixelwise_mul_qasymm8( - TENSOR3D_DECLARATION(in1), - TENSOR3D_DECLARATION(in2), - TENSOR3D_DECLARATION(out), - const float scale) -{ - // Get pixels pointer - Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); - Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); - - // Load data - VEC_DATA_TYPE(int, 16) - in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16)); - VEC_DATA_TYPE(int, 16) - in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16)); - - // Perform multiplication of two inputs - VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET); - VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET); - VEC_DATA_TYPE(int, 16) out_val = in1_val * in2_val; - - // Multiply with a multiplier smaller than 1 - out_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(out_val, RESULT_MULT_INT, RESULT_SHIFT, 16); - out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET); - - VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16)); - -// TODO: Apply min-max BOUND to support fuse with relu. 
-/* -#if defined(MIN_BOUND) - res = max(res, (uchar16)MIN_BOUND); -#endif // defined(MIN_BOUND) -#if defined(MAX_BOUND) - res = min(res, (uchar16)MAX_BOUND); -#endif // defined(MAX_BOUND) -*/ - - // Store result - VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), - 0, (__global DATA_TYPE_OUT *)out.ptr); -} -#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl deleted file mode 100644 index 68da2ba32..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#ifndef VEC_SIZE -#define VEC_SIZE 1 -#endif - -#if defined(DATA_TYPE) -/** Returns result of prelu function implemented as below: - * f(input) = alpha * input for input < 0, f(input) = input for input >= 0. - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @note Can only take floating point data types. - * - * @param[in] input1_ptr Pointer to the source image. 
Supported Data types : F16/F32 - * @param[in] input1_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input1_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source image - * - * @param[in] alpha_ptr Pointer to the source image. Supported Data types : F16/F32 - * @param[in] alpha_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] alpha_step_x input2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] alpha_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] alpha_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] alpha_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source image - * - * @param[out] output_ptr Pointer to the destination image. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void prelu( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(alpha), - TENSOR3D_DECLARATION(output)) -{ - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VSTORE(VEC_SIZE) - (VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) < 0 ? - VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) * VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)alpha.ptr) : - VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), - 0, (__global DATA_TYPE *)output.ptr); - -} -#endif // defined(DATA_TYPE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl deleted file mode 100644 index 7e97b7ed6..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" -#define SUB(x, y) (x) - (y) - -#if defined(OFF_IN1) && defined(OFF_IN2) && defined(OFF_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(VEC_SIZE) - -#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE) -#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) -#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE) -#define CONVERT_RTE(x, type) (convert_##type##_rte((x))) -#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type) - -/** Returns result of prelu function implemented as below: - * f(input) = alpha * input for input < 0, f(input) = input for input >= 0. - * - * @attention Data type can be passed using the -DDATA_TYPE_IN compile flag, e.g. -DDATA_TYPE_IN=uchar - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @note Can only take uchar data types. - * - * @param[in] input1_ptr Pointer to the source image. 
Supported Data types : QASYMM8 - * @param[in] input1_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input1_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source image - * - * @param[in] alpha_ptr Pointer to the source image. Supported Data types : QASYMM8 - * @param[in] alpha_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] alpha_step_x input2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] alpha_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] alpha_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] alpha_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source image - * - * @param[out] output_ptr Pointer to the destination image. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void prelu_qasymm8( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(alpha), - TENSOR3D_DECLARATION(output)) -{ - // Get pixels pointer - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)input.ptr), VEC_INT); - VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)alpha.ptr), VEC_INT); - - in_a = SUB(in_a, (VEC_INT)((int)OFF_IN1)); - in_b = SUB(in_b, (VEC_INT)((int)OFF_IN2)); - - const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1); - const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2); - const VEC_FLOAT outf32 = in1f32 < 0 ? 
in1f32 * in2f32 : in1f32; - const VEC_FLOAT qresf32 = outf32 / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFF_OUT)); - const VEC_UCHAR res = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_UCHAR); - - VSTORE(VEC_SIZE) - (res, 0, (__global uchar *)output.ptr); -} - -#endif // defined(OFF_IN1) && defined(OFF_IN2) && defined(OFF_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(VEC_SIZE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl deleted file mode 100644 index 8bef49363..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016, 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) -/** Perform reduce max/min - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 - * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using - * -DOP_CODE = number. e.g. -DOP_CODE=1 - * - * @param[in] input_ptr Pointer to the source image. 
Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] axis Axis through which reduction occurs - * @param[in] dim Dimension across 
the axis to be reduced. - */ -__kernel void reduce_min_max(TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output), - const int axis, - const int dim) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); - - int indices[4] = - { - get_global_id(0), - get_global_id(1), - get_global_id(2) % DEPTH_OUT, - get_global_id(2) / DEPTH_OUT, - }; - - DATA_TYPE value = *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); - for(int i = 1; i < dim; ++i) - { - indices[axis] = i; - - #if OP_CODE == 1 // REDUCE_MAX - value = max(value, *((__global DATA_TYPE *) - tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]))); - - #elif OP_CODE == 2 // REDUCE_MIN - value = min(value, *((__global DATA_TYPE *) - tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]))); - - #else // OP NOT SUPPORTED - return; - - #endif - } - - *((__global DATA_TYPE *)out.ptr) = value; -} - -/** Perform reduce sum/mean - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 - * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using - * -DOP_CODE = number. e.g. -DOP_CODE=1 - * - * @param[in] input_ptr Pointer to the source image. 
Supported data types: U8/S8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] axis Axis through which reduction occurs - * @param[in] dim Dimension across the axis 
to be reduced. - */ -__kernel void reduce_sum_mean(TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output), - const int axis, - const int dim) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); - - int indices[4] = - { - get_global_id(0), - get_global_id(1), - get_global_id(2) % DEPTH_OUT, - get_global_id(2) / DEPTH_OUT, - }; - - DATA_TYPE sum_value = (DATA_TYPE)0; - for(int i = 0; i < dim; ++i) - { - indices[axis] = i; - sum_value += *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); - } - - #if OP_CODE == 3 // REDUCE_SUM - *((__global DATA_TYPE *)out.ptr) = sum_value; - - #elif OP_CODE == 4 // REDUCE_MEAN - *((__global DATA_TYPE *)out.ptr) = sum_value / CONVERT(dim, DATA_TYPE); - - #else // OP NOT SUPPORTED - return; - - #endif -} -#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl deleted file mode 100644 index a0fc2d5a9..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016, 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) -/** Perform space to batch with input of 4D and NCHW format - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 - * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size. e.g. -DBATCH_IN=16 - * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DHEIGHT_IN=16 - * @attention Input tensor width should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DWIDTH_IN=16 - * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] block_size_ptr Pointer to the source tensor. Supported data types: S32 - * @param[in] block_size_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] block_size_step_x block_size_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] block_size_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] padding_size_ptr Pointer to the source tensor. 
Supported data types: S32 - * @param[in] padding_size_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] padding_size_step_x padding_size_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] padding_size_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] padding_size_step_y padding_size_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] padding_size_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void space_to_batch_4d_nchw(TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output), - VECTOR_DECLARATION(block_size), - IMAGE_DECLARATION(padding_size)) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); - - int block_size_x = *((__global int *)(block_size_ptr)); - int block_size_y = *((__global int *)(block_size_ptr + block_size_stride_x)); - int shift_x = (get_global_id(2) / DEPTH_OUT / BATCH_IN) % block_size_x; - int shift_y = (get_global_id(2) / DEPTH_OUT / BATCH_IN) / block_size_x; - - int in_index[4] = {0, }; - in_index[0] = get_global_id(0) * block_size_x + shift_x - *((__global int *)(padding_size_ptr)); - in_index[1] = get_global_id(1) * block_size_y + shift_y - *((__global int *)(padding_size_ptr + padding_size_stride_y)); - in_index[2] = get_global_id(2) % DEPTH_OUT; - in_index[3] = (get_global_id(2) / DEPTH_OUT) % BATCH_IN; - - if (in_index[0] < 0 || in_index[0] >= WIDTH_IN || in_index[1] < 0 || in_index[1] >= HEIGHT_IN) - { - *((__global DATA_TYPE *)out.ptr) = (DATA_TYPE)ZERO_VALUE; - } - else - { - *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3])); - } -} -#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) - -#if defined(DATA_TYPE) && 
defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE) -/** Perform space to batch with input of 4D and NHWC format - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Output tensor depth should be given as a preprocessor argument using -DHEIGHT_OUT=size. e.g. -DHEIGHT_OUT=16 - * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size. e.g. -DBATCH_IN=16 - * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DHEIGHT_IN=16 - * @attention Input tensor width should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DWIDTH_IN=16 - * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0 - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * - * @param[in] input_ptr Pointer to the source tensor. 
Supported data types: U8/S8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] block_size_ptr Pointer to the source tensor. 
Supported data types: S32 - * @param[in] block_size_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] block_size_step_x block_size_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] block_size_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] padding_size_ptr Pointer to the source tensor. Supported data types: S32 - * @param[in] padding_size_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] padding_size_step_x padding_size_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] padding_size_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] padding_size_step_y padding_size_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] padding_size_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void space_to_batch_4d_nhwc(TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output), - VECTOR_DECLARATION(block_size), - IMAGE_DECLARATION(padding_size)) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, HEIGHT_OUT); - - int block_size_x = *((__global int *)(block_size_ptr)); - int block_size_y = *((__global int *)(block_size_ptr + block_size_stride_x)); - int shift_x = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) % block_size_x; - int shift_y = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) / block_size_x; - - int in_index[4] = {0, }; - in_index[0] = get_global_id(0) * VEC_SIZE; - in_index[1] = get_global_id(1) * block_size_x + shift_x - *((__global int *)(padding_size_ptr)); - in_index[2] = get_global_id(2) % HEIGHT_OUT * block_size_y + shift_y - *((__global int *)(padding_size_ptr + padding_size_stride_y)); - in_index[3] = (get_global_id(2) / HEIGHT_OUT) % BATCH_IN; - - if (in_index[1] < 0 || in_index[1] >= WIDTH_IN || in_index[2] < 0 || 
in_index[2] >= HEIGHT_IN) - { - VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))ZERO_VALUE, 0, (__global DATA_TYPE *)out.ptr); - } - else - { - VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3])), - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), - 0, (__global DATA_TYPE *)out.ptr); - } -} - -#endif // defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl deleted file mode 100644 index f6977045a..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016, 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) -/** Perform space to depth rearrangement of tensor - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16 - * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. 
-DBLOCK_SIZE=1 - * - * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p inpu -t_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in -bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void space_to_depth( - TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output)) - { - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN); - Tensor4D out = 
CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); - - int out_index[4]={0}; - int in_index[4]={0}; - - in_index[0] = get_global_id(0);//W - in_index[1] = get_global_id(1);//H - in_index[2] = get_global_id(2) % DEPTH_IN;//C - in_index[3] = get_global_id(2) / DEPTH_IN;//B - - out_index[0] = in_index[0]/BLOCK_SIZE; - out_index[1] = in_index[1]/BLOCK_SIZE; - out_index[2] = in_index[2] + ((in_index[1] % BLOCK_SIZE) * BLOCK_SIZE + in_index[0] % BLOCK_SIZE) * DEPTH_IN; - out_index[3] = in_index[3]; - - *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0],out_index[1],out_index[2],out_index[3])) = *((__global DATA_TYPE *)in.ptr); - } -#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl deleted file mode 100644 index 3e1a5c97f..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#ifndef VEC_SIZE -#define VEC_SIZE 1 -#endif - -#if defined(DATA_TYPE) -/** Returns true value of squared_difference of two tensors. - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. 
-DDATA_TYPE=float - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @note Can only take floating point data types. - * - * @param[in] input1_ptr Pointer to the source image. Supported data types: F16/F32 - * @param[in] input1_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input1_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source image - * - * @param[in] input2_ptr Pointer to the source image. Supported data types: F16/F32 - * @param[in] input2_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input2_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input2_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input2_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input2_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source image - * - * @param[out] output_ptr Pointer to the destination image. 
Supported data types: F16/F32 - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void squared_difference( - TENSOR3D_DECLARATION(input1), - TENSOR3D_DECLARATION(input2), - TENSOR3D_DECLARATION(output)) -{ - Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1); - Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - diff = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr)- VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr); - - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - sq_diff = diff * diff; - - VSTORE(VEC_SIZE) - (sq_diff, 0, (__global DATA_TYPE *)output.ptr); -} -#endif // defined(DATA_TYPE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice_ex.cl deleted file mode 100644 index b39c55b96..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice_ex.cl +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "helpers.h" - -#if defined(ELEMENT_DATA_TYPE) && defined(DEPTH_OUT) -/** Extracts a strided slice up to 4-dimensions - * - * @note Datatype should be given as a preprocessor argument using -DELEMENT_DATA_TYPE=type. e.g. -DELEMENT_DATA_TYPE=short - * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 - * - * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] starts The stride of X dimension of input tensor to be sliced. Supported data types: S32 - * @param[in] strides The stride of Y dimension of input tensor to be sliced. 
Supported data types: S32 - */ -__kernel void strided_slice_ex(TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output), - const int4 starts, - const int4 strides) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); - - int4 indices_in = - { - starts.x + (strides.x * get_global_id(0)), - starts.y + (strides.y * get_global_id(1)), - starts.z + (strides.z * (get_global_id(2) % DEPTH_OUT)), - starts.w + (strides.w * (get_global_id(2) / DEPTH_OUT)), - }; - *((__global ELEMENT_DATA_TYPE *)out.ptr) = *((__global ELEMENT_DATA_TYPE *)tensor4D_offset(&in, indices_in.x, indices_in.y, indices_in.z, indices_in.w)); -} -#endif // defined(ELEMENT_DATA_TYPE) && defined(DEPTH_OUT) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl deleted file mode 100644 index d97f23a47..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "helpers.h" - -__kernel void topkv2_init(VECTOR_DECLARATION(input), - __global float* in_key_buf, - __global int* in_ind_buf, - const int n) -{ - int gid = get_global_id(0); - int lws = get_local_size(0); - int groups = get_num_groups(0); - int gws = lws * groups; - int iter = n / gws; - - Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input); - - for(int i = 0; i < iter; ++i) - { - int idx = i * gws + gid; - in_key_buf[idx] = *(__global float*)(input.ptr + idx * input.stride_x); - in_ind_buf[idx] = idx; - } -} - -__kernel void topkv2_find_first_negative( - __global float *out_key_buf, - __global int *first_negative_idx, - int n) -{ - int gid = get_global_id(0); - - if( gid == n - 1 ) - { - // if the last item is positive, the first negative index is n. - if( out_key_buf[gid] > 0.f ) - *first_negative_idx = n; - } else if ( gid == 0 ) { - // if the first item is negative, set it 0. - if( out_key_buf[gid] < 0.f ) - *first_negative_idx = 0; - } else { - // if its left is positive and it is negative, then it is the first negative item. 
- if( out_key_buf[gid-1] > 0.f && out_key_buf[gid] < 0.f ) - *first_negative_idx = gid; - } -} - -__kernel void topkv2_reorder_negatives( - __global float* in_key_buf, - __global float* out_key_buf, - __global float* in_ind_buf, - __global float* out_ind_buf, - __global int* first_negative_idx, - int n) -{ - int gid = get_global_id(0); - - int num_negs = n - *first_negative_idx; - int in_idx; - - if( gid < num_negs ) { - in_idx = n - 1 - gid; - } else { - in_idx = gid - num_negs; - } - - out_key_buf[gid] = in_key_buf[in_idx]; - out_ind_buf[gid] = in_ind_buf[in_idx]; -} - -__kernel void topkv2_store( - VECTOR_DECLARATION(values), - VECTOR_DECLARATION(indices), - __global float *out_key_buf, - __global int *out_ind_buf, - int n) -{ - int gid = get_global_id(0); - - Vector values = CONVERT_TO_VECTOR_STRUCT_NO_STEP(values); - Vector indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(indices); - - int idx = n - 1 - gid; - - *(__global float*)(values.ptr + gid * values.stride_x) = out_key_buf[idx]; - *(__global int*)(indices.ptr + gid * indices.stride_x) = out_ind_buf[idx]; -} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl deleted file mode 100644 index 0292fab04..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "helpers.h" - -__global inline float* get_vec_elem(Vector* vec, int idx) -{ - return (__global float*)(vec->ptr + idx * vec->stride_x); -} - -__global inline int* get_vec_elem_int(Vector* vec, int idx) -{ - return (__global int*)(vec->ptr + idx * vec->stride_x); -} - -// A utility function to swap two elements -void swap(__global float *a, __global float *b) -{ - float t = *a; - *a = *b; - *b = t; -} - -void swap_idx(__global int *a, __global int *b) -{ - int t = *a; - *a = *b; - *b = t; -} - -/* This function is same in both iterative and recursive*/ -int partition (Vector* arr, __global int* indices, int l, int h) -{ - float x = *get_vec_elem(arr, h); - int i = (l - 1); - - for (int j = l; j <= h- 1; j++) - { - if (*get_vec_elem(arr, j) >= x) - { - i++; - swap (get_vec_elem(arr,i), get_vec_elem(arr,j)); - swap_idx(&indices[i], &indices[j]); - } - } - swap (get_vec_elem(arr, i + 1), get_vec_elem(arr, h)); - swap_idx(&indices[i + 1], &indices[h]); - return (i + 1); -} - -/* A[] --> Array to be sorted, - l --> Starting index, - h --> Ending index */ -void quickSortIterative (Vector* arr, __global int* indices, - __global int *stack, int l, int h) -{ - // Create an auxiliary stack - - // initialize top of stack - int top = -1; - - // push initial values of l and h to stack - stack[ ++top ] = l; - stack[ ++top ] = h; - - // Keep popping from stack while is not empty - while ( top >= 0 ) - { - // Pop h and l - h = stack[ top-- ]; - l = stack[ top-- ]; - - // Set pivot element at its correct position - // in sorted array - int p = partition( arr, indices, l, h ); - - // If there are elements on left side of pivot, - // then push left side to stack - if ( p-1 > l ) - { - stack[ ++top ] = l; - stack[ ++top ] = p - 1; - } - - // If there are elements on right side of pivot, - // then push right side to stack - if ( p+1 < h ) - { - stack[ ++top ] = p + 
1; - stack[ ++top ] = h; - } - } -} - -__kernel void topkv2_quicksort(VECTOR_DECLARATION(input), - VECTOR_DECLARATION(topk_values), VECTOR_DECLARATION(topk_indices), - __global int* indices, __global int* temp_stack, int k, int n) -{ - Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input); - Vector topk_values = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_values); - Vector topk_indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_indices); - - for( int i = 0; i < n; ++i ) - { - indices[i] = i; - } - - quickSortIterative(&input, indices, temp_stack, 0, n-1); - - // extract k items. - for(int i = 0; i < k; ++i) - { - *get_vec_elem(&topk_values, i) = *get_vec_elem(&input, i); - *get_vec_elem_int(&topk_indices, i) = indices[i]; - } -} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl deleted file mode 100644 index c2c2d89a4..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl +++ /dev/null @@ -1,271 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// reference: -// https://code.google.com/archive/p/ocl-radix-sort/source/default/source -// OpenCL kernel sources for the CLRadixSort class -// the #include does not exist in OpenCL -// Copyright Philippe Helluy, Université de Strasbourg, France, 2011, helluy@math.unistra.fr -// licensed under the GNU Lesser General Public License see http://www.gnu.org/copyleft/lesser.html -// if you find this software usefull you can cite the following work in your reports or articles: -// Philippe HELLUY, A portable implementation of the radix sort algorithm in OpenCL, 2011. -// http://hal.archives-ouvertes.fr/hal-00596730 - -// Reference for floating point radix sort: -// http://www.codercorner.com/RadixSortRevisited.htm - -// compute the histogram for each radix and each virtual processor for the pass -__kernel void radixsort_histogram(__global float* in_key_buf, - __global int* d_Histograms, - const int pass, - __local int* loc_histo, - const int n) -{ - int it = get_local_id(0); // i local number of the processor - int ig = get_global_id(0); // global number = i + g I - - int gr = get_group_id(0); // g group number - - int groups = get_num_groups(0); - int items = get_local_size(0); - - // set the local histograms to zero - for(int ir=0;ir<_RADIX;ir++){ - loc_histo[ir * items + it] = 0; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // range of keys that are analyzed by the work item - int size= n/groups/items; // size of the sub-list - int start= ig * size; // beginning of the sub-list - - unsigned int key; - int shortkey,k; - - // compute the index - // the computation depends on the transposition - for(int j = 0; j < size ; j++) { -#ifdef TRANSPOSE - k= groups * items * j + ig; -#else - k=j+start; -#endif - - key = *((__global unsigned int*)(in_key_buf + k)); - - // extract the group of _BITS bits of the pass - // the result is in the range 0.._RADIX-1 - shortkey=(( key >> (pass * _BITS)) & (_RADIX-1)); - - // increment the local histogram - loc_histo[shortkey * items 
+ it ]++; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // copy the local histogram to the global one - for(int ir=0;ir<_RADIX;ir++) { - d_Histograms[items * (ir * groups + gr) + it] = loc_histo[ir * items + it]; - } - - barrier(CLK_GLOBAL_MEM_FENCE); -} - -// initial transpose of the list for improving -// coalescent memory access -__kernel void transpose(const __global int* invect, - __global int* outvect, - const int nbcol, - const int nbrow, - const __global int* inperm, - __global int* outperm, - __local int* blockmat, - __local int* blockperm, - const int tilesize){ - - int i0 = get_global_id(0)*tilesize; // first row index - int j = get_global_id(1); // column index - - int jloc = get_local_id(1); // local column index - - // fill the cache - for(int iloc=0;iloc<tilesize;iloc++){ - int k=(i0+iloc)*nbcol+j; // position in the matrix - blockmat[iloc*tilesize+jloc]=invect[k]; -#ifdef PERMUT - blockperm[iloc*tilesize+jloc]=inperm[k]; -#endif - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // first row index in the transpose - int j0=get_group_id(1)*tilesize; - - // put the cache at the good place - for(int iloc=0;iloc<tilesize;iloc++){ - int kt=(j0+iloc)*nbrow+i0+jloc; // position in the transpose - outvect[kt]=blockmat[jloc*tilesize+iloc]; -#ifdef PERMUT - outperm[kt]=blockperm[jloc*tilesize+iloc]; -#endif - } - -} - -// each virtual processor reorders its data using the scanned histogram -__kernel void radixsort_reorder(__global float* in_key, - __global float* out_key, - __global int* d_Histograms, - const int pass, - __global int* indices_in, - __global int* indices_out, - __local int* loc_histo, - const int n){ - - int it = get_local_id(0); - int ig = get_global_id(0); - - int gr = get_group_id(0); - int groups=get_num_groups(0); - int items=get_local_size(0); - - int start= ig *(n/groups/items); - int size= n/groups/items; - - // take the histogram in the cache - for(int ir=0;ir<_RADIX;ir++){ - loc_histo[ir * items + it]= - d_Histograms[items * (ir * groups + gr) + 
it]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - int newpos,shortkey,k,newpost; - unsigned int key; - - for(int j= 0; j< size;j++){ -#ifdef TRANSPOSE - k= groups * items * j + ig; -#else - k=j+start; -#endif - float org_value = in_key[k]; - key = *(__global unsigned int*)(in_key + k); - shortkey=((key >> (pass * _BITS)) & (_RADIX-1)); - - newpos=loc_histo[shortkey * items + it]; - -#ifdef TRANSPOSE - int ignew,jnew; - ignew= newpos/(n/groups/items); - jnew = newpos%(n/groups/items); - newpost = jnew * (groups*items) + ignew; -#else - newpost=newpos; -#endif - - //d_outKeys[newpost]= key; // killing line !!! - out_key[newpost] = org_value; - -#ifdef PERMUT - indices_out[newpost] = indices_in[k]; -#endif - - newpos++; - loc_histo[shortkey * items + it]=newpos; - } -} - -// perform a parallel prefix sum (a scan) on the local histograms -// (see Blelloch 1990) each workitem worries about two memories -// see also http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html -__kernel void radixsort_scanhistograms(__global int* histo, __local int* temp, __global int* globsum) -{ - int it = get_local_id(0); - int ig = get_global_id(0); - int decale = 1; - int n=get_local_size(0) * 2 ; - int gr=get_group_id(0); - - // load input into local memory - // up sweep phase - temp[2*it] = histo[2*ig]; - temp[2*it+1] = histo[2*ig+1]; - - // parallel prefix sum (algorithm of Blelloch 1990) - for (int d = n>>1; d > 0; d >>= 1){ - barrier(CLK_LOCAL_MEM_FENCE); - if (it < d){ - int ai = decale*(2*it+1)-1; - int bi = decale*(2*it+2)-1; - temp[bi] += temp[ai]; - } - decale *= 2; - } - - // store the last element in the global sum vector - // (maybe used in the next step for constructing the global scan) - // clear the last element - if (it == 0) { - globsum[gr]=temp[n-1]; - temp[n - 1] = 0; - } - - // down sweep phase - for (int d = 1; d < n; d *= 2){ - decale >>= 1; - barrier(CLK_LOCAL_MEM_FENCE); - - if (it < d){ - int ai = decale*(2*it+1)-1; - int bi = decale*(2*it+2)-1; - - int t = 
temp[ai]; - temp[ai] = temp[bi]; - temp[bi] += t; - } - - } - barrier(CLK_LOCAL_MEM_FENCE); - - // write results to device memory - - histo[2*ig] = temp[2*it]; - histo[2*ig+1] = temp[2*it+1]; - - barrier(CLK_GLOBAL_MEM_FENCE); - -} - -// use the global sum for updating the local histograms -// each work item updates two values -__kernel void radixsort_pastehistograms( __global int* histo,__global int* globsum) -{ - int ig = get_global_id(0); - int gr=get_group_id(0); - - int s; - - s=globsum[gr]; - - // write results to device memory - histo[2*ig] += s; - histo[2*ig+1] += s; - - barrier(CLK_GLOBAL_MEM_FENCE); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp deleted file mode 100644 index 1fdd2f98f..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp +++ /dev/null @@ -1,211 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/core/CL/kernels/CLActivationLayerExKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/UtilsEx.h" - -using namespace arm_compute; - -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - const ActivationLayerInfoEx &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, - DataType::F16, DataType::F32); - - // Checks performed when output is configured - if ((output != nullptr) && (output->total_size() != 0)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - } - - return Status{}; -} - -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) -{ - if (output != nullptr) - { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output, *input); - } - - const unsigned int num_elems_processed_per_iteration = 16 / input->element_size(); - - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - bool window_changed = false; - - if (output != nullptr) - { - AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - window_changed = update_window_and_padding(win, input_access, output_access); - output_access.set_valid_region(win, input->valid_region()); - } - else - { - window_changed = update_window_and_padding( - win, AccessWindowHorizontal(input, 0, num_elems_processed_per_iteration)); - } - - Status err = (window_changed) - ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; - return std::make_pair(err, win); -} -} // namespace - -CLActivationLayerExKernel::CLActivationLayerExKernel() - : _input(nullptr), _output(nullptr), _run_in_place(false) -{ -} - -void CLActivationLayerExKernel::configure(ICLTensor *input, ICLTensor *output, - ActivationLayerInfoEx act_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input); - - _run_in_place = (output == nullptr) || (output == input); - - if (output != nullptr) - { - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), *input->info()->clone()); - } - - ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, act_info)); - - const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size(); - const DataType dt = input->info()->data_type(); - float a_const = act_info.a(); - float b_const = act_info.b(); - int a_const_int = 0; - int b_const_int = 0; - - // Create quantized version of constants a, b if needed - if (is_data_type_quantized(dt)) - { - a_const_int = - input->info()->quantization_info().quantize(a_const, RoundingPolicy::TO_NEAREST_UP); - b_const_int = - input->info()->quantization_info().quantize(b_const, RoundingPolicy::TO_NEAREST_UP); - } - - // Set build options - std::set<std::string> build_opts; - build_opts.emplace( - ("-DACT=" + lower_string(string_from_activation_func_ex(act_info.activation())))); - build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt))); - build_opts.emplace( - ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); - - if (is_data_type_quantized(dt)) - { - build_opts.emplace(("-DA_VAL=" + support::cpp11::to_string(a_const_int))); - build_opts.emplace(("-DB_VAL=" + support::cpp11::to_string(b_const_int))); - - const int o1 = input->info()->quantization_info().offset; - // Quantized value of 0 corresponds to the offset o1 - 
build_opts.emplace(("-DCONST_0=" + support::cpp11::to_string(o1))); - - // Set scale and offset of the input and output if they have different quantization info - if (is_data_type_quantized_asymmetric(dt) && output != nullptr) - { - const float s1 = input->info()->quantization_info().scale; - const float s2 = output->info()->quantization_info().scale; - const int o2 = output->info()->quantization_info().offset; - - if (o1 != o2 || s1 != s2) - { - build_opts.emplace(("-DS1_VAL=" + float_to_string_with_full_precision(s1))); - build_opts.emplace(("-DS2_VAL=" + float_to_string_with_full_precision(s2))); - build_opts.emplace(("-DO1_VAL=" + support::cpp11::to_string(o1))); - build_opts.emplace(("-DO2_VAL=" + support::cpp11::to_string(o2))); - } - } - } - else - { - build_opts.emplace(("-DA_VAL=" + float_to_string_with_full_precision(a_const))); - build_opts.emplace(("-DB_VAL=" + float_to_string_with_full_precision(b_const))); - } - - build_opts.emplace((_run_in_place) ? "-DIN_PLACE" : ""); - - // Create kernel - std::string kernel_name = std::string("activation_layer_ex"); - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); - - // Make sure _kernel is initialized before calling the parent's configure - _input = input; - _output = output; - - // Configure kernel window - auto win_config = - validate_and_configure_window(input->info(), (_run_in_place) ? 
nullptr : output->info()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); - - // Set config_id for enabling LWS tuning - _config_id = "activation_layer_ex_"; - _config_id += lower_string(string_from_data_type(dt)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(1)); -} - -Status CLActivationLayerExKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const ActivationLayerInfoEx &act_info) -{ - const bool run_in_place = (output == nullptr) || (output == input); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, act_info)); - ARM_COMPUTE_RETURN_ON_ERROR( - validate_and_configure_window(input->clone().get(), - (run_in_place) ? nullptr : output->clone().get()) - .first); - - return Status{}; -} - -void CLActivationLayerExKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice); - if (!_run_in_place) - { - add_3D_tensor_argument(idx, _output, slice); - } - enqueue(queue, *this, slice, lws_hint()); - } while (collapsed.slide_window_slice_3D(slice)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp deleted file mode 100644 index c1a2ad0be..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLArgMinMaxKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t argminmax_axis) -{ - TensorShape out_shape{input_shape}; - - out_shape.set(argminmax_axis, 1); - - return out_shape; -} -} // namespace - -namespace -{ -constexpr unsigned int num_elems_processed_per_iteration = 16; - -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - const uint32_t argminmax_axis, ArgOperation op) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32, DataType::F32, - DataType::U8); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - - ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(input, output); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0, - "Inputs are not broadcast compatible"); - - const TensorShape output_shape = inferOutputShape(input->tensor_shape(), argminmax_axis); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(), - "output shape's size does not match argminmax_axis"); - - const auto num_dimensions = input->tensor_shape().num_dimensions(); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - argminmax_axis >= 0 && 
argminmax_axis < num_dimensions, - "argminmax_axis must be greater than or equal to 0 and less than (input's rank)."); - return Status{}; -} - -} // namespace - -CLArgMinMaxKernel::CLArgMinMaxKernel() : _input(nullptr), _output(nullptr), _argminmax_axis() {} - -void CLArgMinMaxKernel::configure(const ICLTensor *input, ICLTensor *output, - const uint32_t argminmax_axis, ArgOperation op) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), argminmax_axis)); - - _input = input; - _output = output; - _argminmax_axis = argminmax_axis; - - std::unique_ptr<ITensorInfo> output_info = output->info()->clone(); - output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), argminmax_axis)); - - // Construct kernel name for argmax and argmin based on axis - std::string kernel_name = "arg_op"; - int op_code = 0; - if (op == ArgOperation::MAX) - { - op_code = 1; - } - else if (op == ArgOperation::MIN) - { - op_code = 2; - } - else - throw std::runtime_error("Operation not supported, yet"); - - // Set kernel build options - std::set<std::string> build_opts; - build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(output_info->data_type())); - build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2))); - build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code)); - - // Create kernel - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); - - // Configure kernel window - Window win = calculate_max_window(*output_info, Steps()); - - Coordinates coord; - coord.set_num_dimensions(output_info->num_dimensions()); - output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape())); - - ICLKernel::configure_internal(win); -} - -Status CLArgMinMaxKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const uint32_t argminmax_axis, ArgOperation op) -{ - 
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, argminmax_axis, op)); - - return Status{}; -} - -void CLArgMinMaxKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const TensorShape &shape_in = _input->info()->tensor_shape(); - - unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters - - _kernel.setArg<cl_int>(idx++, _argminmax_axis); - _kernel.setArg<cl_int>(idx++, shape_in[_argminmax_axis]); - - Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); - - // Setup input slice - Window slice_in(slice_out); - slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - slice_in.set(3, Window::Dimension(0, 0, 0)); - - // Copy output's shape in order to use for recovering at end of this method - const TensorShape shape_out = _output->info()->tensor_shape(); - _output->info()->set_tensor_shape(inferOutputShape(shape_in, _argminmax_axis)); - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, _input, slice_in); - add_4D_tensor_argument(idx, _output, slice_out); - enqueue(queue, *this, slice_out); - } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); - - // Recover output's shape of output tensor - _output->info()->set_tensor_shape(shape_out); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp deleted file mode 100644 index 1c505b4d5..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp +++ /dev/null @@ -1,216 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionExKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -constexpr unsigned int num_elems_processed_per_iteration = 16; - -Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *output, ConvertPolicy policy) -{ - ARM_COMPUTE_UNUSED(policy); - - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, - DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, - DataType::F16, DataType::F32); - - const TensorShape &out_shape = - TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, - "Inputs are not broadcast compatible"); - - // Validate in case of configured output - if (output->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, - DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - output->data_type() == DataType::U8 && - (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8), - "Output can only be U8 if 
both inputs are U8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), - "Wrong shape for output"); - } - - return Status{}; -} - -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, - ITensorInfo *output) -{ - const std::pair<TensorShape, ValidRegion> broadcast_pair = - ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2); - const TensorShape &out_shape = broadcast_pair.first; - const ValidRegion &valid_region = broadcast_pair.second; - - // Auto initialize output if not initialized - { - set_shape_if_empty(*output, out_shape); - - if (input1->data_type() == DataType::S16 || input2->data_type() == DataType::S16) - { - set_format_if_unknown(*output, Format::S16); - } - else if (input1->data_type() == DataType::F16 && input2->data_type() == DataType::F16) - { - set_format_if_unknown(*output, Format::F16); - } - else if (input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32) - { - set_format_if_unknown(*output, Format::F32); - } - } - - Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); - Window win_input1 = win.broadcast_if_dimension_le_one(*input1); - Window win_input2 = win.broadcast_if_dimension_le_one(*input2); - - AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win_input1, input1_access) || - update_window_and_padding(win_input2, input2_access) || - update_window_and_padding(win, output_access); - - output_access.set_valid_region(win, valid_region); - - Status err = (window_changed) - ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; - return std::make_pair(err, win); -} -} // namespace - -CLArithmeticSubtractionExKernel::CLArithmeticSubtractionExKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr) -{ -} - -void CLArithmeticSubtractionExKernel::configure(const ICLTensor *input1, const ICLTensor *input2, - ICLTensor *output, ConvertPolicy policy) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); - ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(input1->info(), input2->info(), output->info(), policy)); - - // Configure kernel window - auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - - _input1 = input1; - _input2 = input2; - _output = output; - - const bool has_float_out = is_data_type_float(output->info()->data_type()); - - // Set kernel build options - std::set<std::string> build_opts; - build_opts.emplace((policy == ConvertPolicy::WRAP || has_float_out) ? 
"-DWRAP" : "-DSATURATE"); - build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type())); - build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type())); - build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); - - // Create kernel - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("arithmetic_sub_ex", build_opts)); - - ICLKernel::configure_internal(win_config.second); -} - -Status CLArithmeticSubtractionExKernel::validate(const ITensorInfo *input1, - const ITensorInfo *input2, - const ITensorInfo *output, ConvertPolicy policy) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, policy)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), - input2->clone().get(), - output->clone().get()) - .first); - - return Status{}; -} - -void CLArithmeticSubtractionExKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const TensorShape &in_shape1 = _input1->info()->tensor_shape(); - const TensorShape &in_shape2 = _input2->info()->tensor_shape(); - const TensorShape &out_shape = _output->info()->tensor_shape(); - - bool can_collapse = true; - if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) - { - can_collapse = - (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); - for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) - { - can_collapse = (in_shape1[d] == in_shape2[d]); - } - } - - bool has_collapsed = false; - Window collapsed = - can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) - : window; - - const TensorShape &in_shape1_collapsed = - has_collapsed ? 
in_shape1.collapsed_from(Window::DimZ) : in_shape1; - const TensorShape &in_shape2_collapsed = - has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; - - Window slice = collapsed.first_slice_window_3D(); - Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); - Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); - - do - { - unsigned int idx = 0; - - add_3D_tensor_argument(idx, _input1, slice_input1); - add_3D_tensor_argument(idx, _input2, slice_input2); - add_3D_tensor_argument(idx, _output, slice); - - enqueue(queue, *this, slice); - - collapsed.slide_window_slice_3D(slice_input1); - collapsed.slide_window_slice_3D(slice_input2); - } while (collapsed.slide_window_slice_3D(slice)); -} - -BorderSize CLArithmeticSubtractionExKernel::border_size() const -{ - const unsigned int replicateSize = - _output->info()->dimension(0) - - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); - const unsigned int border = - std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); - return BorderSize(0, border, 0, 0); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp deleted file mode 100644 index b0016d23c..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - const int32_t *block_size) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size[0] >= 1 && block_size[1] >= 1, - "Block size should be greater than or equal to 1."); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) == output->dimension(2), - "Input Depth should be equal to Output Depth"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - output->dimension(3) * block_size[0] * block_size[1] == input->dimension(3), - "Input batch should be equal to (output batch * block size[0] *block size[1])"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(output->dimension(0) % block_size[1]) && - !(output->dimension(1) % block_size[0]), - "Output height and width should be divisible by block size[0] " - "and block_size[1] respectively"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) == input->dimension(0) * block_size[1]) && - (output->dimension(1) == input->dimension(1) * block_size[0]), - "Output height and width should be 
equal to " - "input_height*blocksize[0] and input_width*blocksize[1] " - "respectively"); - - return Status{}; -} - -} // namespace - -CLBatchToSpaceNDKernel::CLBatchToSpaceNDKernel() : _input(nullptr), _output(nullptr) {} - -void CLBatchToSpaceNDKernel::configure(const ICLTensor *input, ICLTensor *output, - const int32_t *block_size) -{ - - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size)); - - _input = input; - _output = output; - - // Set kernel build options - std::set<std::string> build_opts; - build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.emplace("-DBLOCK_SIZE0=" + support::cpp11::to_string(block_size[0])); - build_opts.emplace("-DBLOCK_SIZE1=" + support::cpp11::to_string(block_size[1])); - build_opts.emplace("-DBATCH_OUT=" + support::cpp11::to_string(output->info()->dimension(3))); - build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); - - // Create kernel - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("batch_to_space_nd", build_opts)); - - // Configure kernel window - Window win = calculate_max_window(*output->info(), Steps()); - - Coordinates coord; - coord.set_num_dimensions(output->info()->num_dimensions()); - output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); - - ICLKernel::configure_internal(win); -} - -void CLBatchToSpaceNDKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); - - Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); - - // Setup output slice - Window slice_out(slice_in); - slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_out.set(Window::DimZ, 
Window::Dimension(0, 0, 0)); - slice_out.set(3, Window::Dimension(0, 0, 0)); - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, _input, slice_out); - add_4D_tensor_argument(idx, _output, slice_in); - enqueue(queue, *this, slice_in); - } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp deleted file mode 100644 index 3d2f2c702..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp +++ /dev/null @@ -1,173 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -constexpr unsigned int num_elems_processed_per_iteration = 16; - -Status validate_parameters(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *output) -{ - const TensorShape &out_shape = - TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); - - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, - "Inputs are not broadcast compatible"); - // Validate in case of configured output - if (output->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, - DataType::QASYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), - "Wrong shape for output"); - } - return Status{}; -} -} // namespace - -CLBinaryLogicalOpKernel::CLBinaryLogicalOpKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr) -{ -} - -void CLBinaryLogicalOpKernel::configure(const ICLTensor *input1, const ICLTensor *input2, - ICLTensor *output, BinaryLogicalOperation op) -{ - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_parameters(input1->info(), input2->info(), output->info())); - - _input1 = input1; - _input2 = input2; - _output = output; - - // Create kernel - std::string kernel_name = "binary_logical_op"; - std::set<std::string> build_opts; - build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type()))); - - int op_code = 0; - 
switch (op) - { - case BinaryLogicalOperation::AND: - op_code = 1; - break; - case BinaryLogicalOperation::OR: - op_code = 2; - break; - default: - throw std::runtime_error("Operation not supported, yet"); - } - - build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code))); - build_opts.emplace( - ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); - - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); - - const std::pair<TensorShape, ValidRegion> broadcast_pair = - ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info()); - - const TensorShape &out_shape = broadcast_pair.first; - const ValidRegion &valid_region = broadcast_pair.second; - - Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); - Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info()); - Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info()); - - AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win_input1, input1_access) || - update_window_and_padding(win_input2, input2_access) || - update_window_and_padding(win, output_access); - - output_access.set_valid_region(win, valid_region); - - ICLKernel::configure_internal(win); -} - -void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const TensorShape &in_shape1 = _input1->info()->tensor_shape(); - const TensorShape &in_shape2 = _input2->info()->tensor_shape(); - const TensorShape &out_shape = _output->info()->tensor_shape(); - - bool 
can_collapse = true; - if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) - { - can_collapse = - (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); - for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) - { - can_collapse = (in_shape1[d] == in_shape2[d]); - } - } - - bool has_collapsed = false; - Window collapsed = - can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) - : window; - - const TensorShape &in_shape1_collapsed = - has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; - const TensorShape &in_shape2_collapsed = - has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; - - Window slice = collapsed.first_slice_window_3D(); - Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); - Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input1, slice_input1); - add_3D_tensor_argument(idx, _input2, slice_input2); - add_3D_tensor_argument(idx, _output, slice); - - enqueue(queue, *this, slice); - - collapsed.slide_window_slice_3D(slice_input1); - collapsed.slide_window_slice_3D(slice_input2); - } while (collapsed.slide_window_slice_3D(slice)); -} - -BorderSize CLBinaryLogicalOpKernel::border_size() const -{ - const unsigned int replicateSize = - _output->info()->dimension(0) - - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); - const unsigned int border = - std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); - return BorderSize(0, border, 0, 0); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp deleted file mode 100644 index bf7ebae3f..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2018 Samsung 
Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLCastKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -CLCastKernel::CLCastKernel() : _input(nullptr), _output(nullptr) {} - -void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); - - _input = input; - _output = output; - - constexpr unsigned int num_elems_processed_per_iteration = 16; - - // Set kernel build options - std::set<std::string> build_opts; - build_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); - build_opts.emplace( - ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); - - // Create kernel - if 
(is_data_type_quantized_asymmetric(input->info()->data_type())) - { - const float scale_in = input->info()->quantization_info().scale; - const int offset_in = input->info()->quantization_info().offset; - build_opts.emplace("-DSCALE=" + float_to_string_with_full_precision(scale_in)); - build_opts.emplace("-DOFFSET=" + support::cpp11::to_string(offset_in)); - - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("cast_qasymm_in", build_opts)); - } - else if (is_data_type_quantized_asymmetric(output->info()->data_type())) - { - const float scale_in = output->info()->quantization_info().scale; - const int offset_in = output->info()->quantization_info().offset; - build_opts.emplace("-DSCALE=" + float_to_string_with_full_precision(scale_in)); - build_opts.emplace("-DOFFSET=" + support::cpp11::to_string(offset_in)); - - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("cast_qasymm_out", build_opts)); - } - else - { - _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("cast", build_opts)); - } - - // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - update_window_and_padding(win, input_access, output_access); - output_access.set_valid_region(win, input->info()->valid_region()); - - ICLKernel::configure_internal(win); -} - -void CLCastKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice); - 
add_3D_tensor_argument(idx, _output, slice); - enqueue(queue, *this, slice); - } while (collapsed.slide_window_slice_3D(slice)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp deleted file mode 100644 index 5af5b16ea..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp +++ /dev/null @@ -1,212 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
#include "arm_compute/core/CL/kernels/CLComparisonOpKernel.h"

#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"

using namespace arm_compute;

namespace
{
// Number of elements each work-item processes along the X axis.
constexpr unsigned int num_elems_processed_per_iteration = 16;

// Check that the two inputs are broadcast-compatible, of a supported data
// type, and (if already configured) that the output is QASYMM8 with the
// broadcast shape. Returns an error Status instead of throwing.
Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
                          const ITensorInfo *output)
{
  const TensorShape &out_shape =
      TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());

  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::U16,
                                                       DataType::S16, DataType::F16, DataType::S32,
                                                       DataType::F32, DataType::QASYMM8);
  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::U16,
                                                       DataType::S16, DataType::F16, DataType::S32,
                                                       DataType::F32, DataType::QASYMM8);

  // broadcast_shape returns an empty shape when the inputs cannot broadcast.
  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
                                  "Inputs are not broadcast compatible");
  // Validate in case of configured output
  if (output->total_size() > 0)
  {
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
        detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
        "Wrong shape for output");
  }
  return Status{};
}
} // namespace

CLComparisonOpKernel::CLComparisonOpKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
{
}

// Configure the element-wise comparison kernel (EQUAL / NOT_EQUAL).
// Selects the plain or _qasymm8 kernel variant, auto-initializes the output
// metadata from the broadcast shape, and sets up the broadcast-aware windows.
void CLComparisonOpKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
                                     ICLTensor *output, const ComparisonOperation &op)
{
  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info()));

  _input1 = input1;
  _input2 = input2;
  _output = output;

  // Create kernel
  std::string kernel_name = "comparison_op";
  int op_code = 0;

  // The OP_CODE build option selects the comparison inside the CL source.
  switch (op)
  {
    case ComparisonOperation::EQUAL:
      op_code = 1;
      break;
    case ComparisonOperation::NOT_EQUAL:
      op_code = 2;
      break;
    default:
      throw std::runtime_error(" Operation not supported, yet");
  }

  std::set<std::string> build_opts;
  build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code)));
  build_opts.emplace(("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input1->info()->data_type())));
  build_opts.emplace(
      ("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())));
  build_opts.emplace(
      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));

  // When the two quantized inputs use different quantization parameters the
  // kernel must re-quantize, so switch to the _qasymm8 variant and pass the
  // per-input scale/offset.
  if (is_data_type_quantized_asymmetric(input1->info()->data_type()) &&
      ((input1->info()->quantization_info().offset != input2->info()->quantization_info().offset) ||
       (input1->info()->quantization_info().scale != input2->info()->quantization_info().scale)))
  {
    build_opts.emplace("-DOFFSET_IN1=" +
                       support::cpp11::to_string(input1->info()->quantization_info().offset));
    build_opts.emplace("-DOFFSET_IN2=" +
                       support::cpp11::to_string(input2->info()->quantization_info().offset));
    build_opts.emplace("-DSCALE_IN1=" +
                       support::cpp11::to_string(input1->info()->quantization_info().scale));
    build_opts.emplace("-DSCALE_IN2=" +
                       support::cpp11::to_string(input2->info()->quantization_info().scale));
    kernel_name += "_qasymm8";
  }

  _kernel =
      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));

  const std::pair<TensorShape, ValidRegion> broadcast_pair =
      ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());

  const TensorShape &out_shape = broadcast_pair.first;
  const ValidRegion &valid_region = broadcast_pair.second;

  // Auto initialize output if not initialized
  {
    set_shape_if_empty(*output->info(), out_shape);

    if (input1->info()->data_type() == DataType::S16 ||
        input2->info()->data_type() == DataType::S16)
    {
      set_format_if_unknown(*output->info(), Format::S16);
    }
    else if (input1->info()->data_type() == DataType::F16 &&
             input2->info()->data_type() == DataType::F16)
    {
      set_format_if_unknown(*output->info(), Format::F16);
    }
    else if (input1->info()->data_type() == DataType::F32 ||
             input2->info()->data_type() == DataType::F32)
    {
      set_format_if_unknown(*output->info(), Format::F32);
    }
  }

  Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
  Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info());
  Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info());

  AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
  AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);

  // NOTE(review): `||` short-circuits, so the later update_window_and_padding
  // calls are skipped once one reports a change, and the result is never
  // consulted — presumably inherited from the upstream ACL pattern; confirm
  // whether that is intended before relying on the padding of all three.
  bool window_changed = update_window_and_padding(win_input1, input1_access) ||
                        update_window_and_padding(win_input2, input2_access) ||
                        update_window_and_padding(win, output_access);

  output_access.set_valid_region(win, valid_region);

  ICLKernel::configure_internal(win);
}

// Enqueue the comparison over every 3D slice, keeping separate (possibly
// broadcast) slices for each input.
void CLComparisonOpKernel::run(const Window &window, cl::CommandQueue &queue)
{
  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);

  const TensorShape &in_shape1 = _input1->info()->tensor_shape();
  const TensorShape &in_shape2 = _input2->info()->tensor_shape();
  const TensorShape &out_shape = _output->info()->tensor_shape();

  // Collapsing dims >= Z is only safe when neither input broadcasts along
  // them, i.e. both shapes agree on every dimension from Z upward.
  bool can_collapse = true;
  if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
  {
    can_collapse =
        (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
    for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
    {
      can_collapse = (in_shape1[d] == in_shape2[d]);
    }
  }

  bool has_collapsed = false;
  Window collapsed =
      can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
                   : window;

  const TensorShape &in_shape1_collapsed =
      has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
  const TensorShape &in_shape2_collapsed =
      has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;

  Window slice = collapsed.first_slice_window_3D();
  Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
  Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);

  do
  {
    unsigned int idx = 0;
    add_3D_tensor_argument(idx, _input1, slice_input1);
    add_3D_tensor_argument(idx, _input2, slice_input2);
    add_3D_tensor_argument(idx, _output, slice);

    enqueue(queue, *this, slice);

    // Advance the broadcast input slices in lock-step with the output slice;
    // their return values are intentionally ignored (broadcast slices may
    // legitimately stop sliding before the output does).
    collapsed.slide_window_slice_3D(slice_input1);
    collapsed.slide_window_slice_3D(slice_input2);
  } while (collapsed.slide_window_slice_3D(slice));
}

// Border on the right edge when one input is narrower than the output
// (X-broadcast): up to a full vector minus one element may be replicated.
BorderSize CLComparisonOpKernel::border_size() const
{
  const unsigned int replicateSize =
      _output->info()->dimension(0) -
      std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
  const unsigned int border =
      std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
  return BorderSize(0, border, 0, 0);
}
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - const int32_t block_size) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size >= 1, - "Block size should be greater than or equal to 1."); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) == input->dimension(0) * block_size, - "Output width should be equal to (Input width * block size)"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(1) == input->dimension(1) * block_size, - "Output height should be equal to (Input height * block size)"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) % (block_size * block_size) == 0, - "Input depth should be divisible by (block size * block size)"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - output->dimension(2) == input->dimension(2) / (block_size * block_size), - "Output depth should be equal to (Input depth / (block size * block size))"); - - return Status{}; -} -} // namespace - -CLDepthToSpaceKernel::CLDepthToSpaceKernel() : _input(nullptr), 
_output(nullptr) -{ - // DO NOTHING -} - -void CLDepthToSpaceKernel::configure(const ICLTensor *input, ICLTensor *output, - const int32_t block_size) -{ - - _input = input; - _output = output; - - // Set kernel build options - std::set<std::string> build_opts; - build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size)); - build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); - - // Create kernel - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("depth_to_space", build_opts)); - - // Configure kernel window - Window win = calculate_max_window(*output->info(), Steps()); - - Coordinates coord; - coord.set_num_dimensions(output->info()->num_dimensions()); - output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); - - ICLKernel::configure_internal(win); -} - -void CLDepthToSpaceKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); - - Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); - - // Setup input slice - Window slice_in(slice_out); - slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - slice_in.set(3, Window::Dimension(0, 0, 0)); - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, _input, slice_in); - add_4D_tensor_argument(idx, _output, slice_out); - enqueue(queue, *this, slice_out); - } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp deleted file mode 100644 index 
0862b78bf..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -constexpr unsigned int num_elems_processed_per_iteration = 16; - -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) -{ - Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win, input_access, output_access); - input_access.set_valid_region(win, output->valid_region()); - - Status err = (window_changed) - ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; - return std::make_pair(err, win); -} -} // namespace - -CLEmbeddingLookupKernel::CLEmbeddingLookupKernel() - : _input(nullptr), _output(nullptr), _lookups(nullptr) -{ -} - -Status CLEmbeddingLookupKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *lookups) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - - ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4); - ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1); - - return Status{}; -} - -void CLEmbeddingLookupKernel::configure(const ICLTensor *input, ICLTensor *output, - const ICLTensor *lookups) -{ - ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info())); - - _input = input; - _output = output; - _lookups = lookups; - - // Set kernel build options - std::stringstream kernel_name; - std::set<std::string> build_opts; - kernel_name << "embedding_lookup"; - - build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); - build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions())); - - // Create kernel - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts)); - - // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), output->info()); - 
ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); -} - -void CLEmbeddingLookupKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); - - Window win_lookup; - win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0)); - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, _input, slice_in); - add_4D_tensor_argument(idx, _output, slice_in); - add_1D_tensor_argument(idx, _lookups, win_lookup); - - enqueue(queue, *this, slice_in); - } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_1D(win_lookup)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp deleted file mode 100644 index b1ee21bdc..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/core/CL/kernels/CLExpKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -CLExpKernel::CLExpKernel() : _input(nullptr), _output(nullptr) {} - -void CLExpKernel::configure(const ICLTensor *input, ICLTensor *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - // Auto initialize output - auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), - input->info()->quantization_info()); - - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - - _input = input; - _output = output; - - constexpr unsigned int num_elems_processed_per_iteration = 4; - - // Create kernel - std::set<std::string> build_opts; - build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); - build_opts.emplace( - ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("exp_layer", build_opts)); - - // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - update_window_and_padding(win, input_access, output_access); - output_access.set_valid_region(win, input->info()->valid_region()); - - ICLKernel::configure_internal(win); -} - -void CLExpKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window collapsed = 
window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice); - add_3D_tensor_argument(idx, _output, slice); - enqueue(queue, *this, slice); - } while (collapsed.slide_window_slice_3D(slice)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp deleted file mode 100644 index ae2801e2b..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/core/CL/kernels/CLGatherKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -constexpr unsigned int num_elems_processed_per_iteration = 1; - -Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S32, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S32, - DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); - - return Status{}; -} - -} // namespace - -CLGatherKernel::CLGatherKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr) {} - -void CLGatherKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info())); - - _input1 = input1; - _input2 = input2; - _output = output; - - // Construct kernel name - std::string kernel_name = "gather"; - if (input1->info()->num_dimensions() == 1) - { - kernel_name = "gather_1d"; - } - else if (input1->info()->num_dimensions() == 2) - { - if (_output->info()->num_dimensions() == 1) - { - kernel_name = "gather_1d_out"; - } - } - - // Set kernel build options - std::set<std::string> build_opts; - build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type())); - build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type())); - build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); - - // Create kernel - _kernel = - 
static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); - - // Configure kernel window - Window win = calculate_max_window(*input2->info(), Steps(num_elems_processed_per_iteration)); - output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); - - ICLKernel::configure_internal(win); -} - -Status CLGatherKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output)); - - return Status{}; -} - -void CLGatherKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - if (_input1->info()->num_dimensions() == 1) - { - Window slice = window.first_slice_window_1D(); - - unsigned int idx = 0; - add_1D_tensor_argument(idx, _input1, slice); - add_1D_tensor_argument(idx, _input2, slice); - add_1D_tensor_argument(idx, _output, slice); - enqueue(queue, *this, slice); - } - else if (_input1->info()->num_dimensions() == 2) - { - Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY); - Window slice = window.collapse_if_possible(ICLKernel::window(), Window::DimX); - - // Set inputs - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input1, window_collapsed); - add_1D_tensor_argument(idx, _input2, slice); - if (_output->info()->num_dimensions() == 1) - { - add_1D_tensor_argument(idx, _output, slice); - } - else - { - add_2D_tensor_argument(idx, _output, window_collapsed); - } - enqueue(queue, *this, slice); - } -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp deleted file mode 100644 index cd7b21c6d..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp +++ 
/dev/null @@ -1,177 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -constexpr unsigned int num_elems_processed_per_iteration = 16; - -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) -{ - Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win, input_access, output_access); - input_access.set_valid_region(win, output->valid_region()); - - Status err = (window_changed) - ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; - return std::make_pair(err, win); -} -} // namespace - -CLHashtableLookupKernel::CLHashtableLookupKernel() - : _input(nullptr), _output(nullptr), _lookups(nullptr) -{ -} - -Status CLHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITensorInfo *keys, - const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *hits) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0, - "Output's shape was not set"); - - ARM_COMPUTE_ERROR_ON(lookups->dimensions(0) == hits->dimensions(0) && - output->dimension(output->num_dimensions() - 1) == lookups->dimension(0)); - ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4); - ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1); - ARM_COMPUTE_ERROR_ON(keys->num_dimensions() > 1); - ARM_COMPUTE_ERROR_ON(hits->num_dimensions() > 1); - - return Status{}; -} - -void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTensor *keys, - const ICLTensor *input, ICLTensor *output, ICLTensor *hits) -{ - ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info())); - - _lookups = lookups; - _keys = keys; - _input = input; - _output = output; - _hits = hits; - - // Make _lookup_indices tensor - _lookup_indices = arm_compute::support::cpp14::make_unique<CLTensor>(); - 
_lookup_indices->allocator()->init( - TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32)); - _lookup_indices->allocator()->allocate(); - - // Set kernel build options - std::stringstream kernel_name; - std::set<std::string> build_opts; - kernel_name << "hashtable_lookup"; - - build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); - build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions())); - - // Create kernel - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts)); - - // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), output->info()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); -} - -void CLHashtableLookupKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - const_cast<ICLTensor *>(_lookups)->map(queue); - const_cast<ICLTensor *>(_keys)->map(queue); - _hits->map(queue); - _lookup_indices->map(queue); - - // Set values of hits - const int32_t *lookups_buf = - reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_lookups)->buffer()); - const int32_t *keys_buf = reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_keys)->buffer()); - uint8_t *hits_buf = reinterpret_cast<uint8_t *>(_hits->buffer()); - int32_t *lookup_indices_buf = reinterpret_cast<int32_t *>(_lookup_indices->buffer()); - - std::map<int32_t, size_t> key_map; - const size_t keys_num = _keys->info()->dimension(0); - for (size_t key_index = 0; key_index < keys_num; key_index++) - { - key_map[keys_buf[key_index]] = 
key_index; - } - - const size_t lookups_num = _lookups->info()->dimension(0); - for (size_t i = 0; i < lookups_num; ++i) - { - const auto lookup_value = lookups_buf[i]; - const auto it = key_map.find(lookup_value); - if (it != key_map.end()) - { -#if defined(DEBUG) - if (it->second >= lookups_num) - ARM_COMPUTE_ERROR("HashTable Lookup: index out of bounds."); -#endif // defined(DEBUG) - lookup_indices_buf[i] = static_cast<int32_t>(it->second); - hits_buf[i] = static_cast<uint8_t>(1); - } - else - { - lookup_indices_buf[i] = -1; - hits_buf[i] = static_cast<uint8_t>(0); - } - } - - const_cast<ICLTensor *>(_lookups)->unmap(queue); - const_cast<ICLTensor *>(_keys)->unmap(queue); - _hits->unmap(queue); - _lookup_indices->unmap(queue); - - Window win = window.collapse(ICLKernel::window(), 2, 4); - - Window win_lookup; - win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0)); - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, _input, win); - add_4D_tensor_argument(idx, _output, win); - add_1D_tensor_argument(idx, _lookup_indices.get(), win_lookup); - - enqueue(queue, *this, win); - } while (window.slide_window_slice_4D(win) && window.slide_window_slice_1D(win_lookup)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp deleted file mode 100644 index 80d99dd3b..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLNegKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16, DataType::S32, - DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16, DataType::S32, - DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(input->info()->tensor_shape(), - output->info()->tensor_shape()); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - - return Status{}; -} - -} // namespace - -CLNegKernel::CLNegKernel() : _input(nullptr), _output(nullptr) {} - -void CLNegKernel::configure(const ICLTensor *input, ICLTensor *output) -{ - - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info())); - - _input = input; - _output = output; - - constexpr unsigned int num_elems_processed_per_iteration = 16; - - // Create kernel - std::set<std::string> build_opts; - build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); - build_opts.emplace( - ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("neg_tensor", build_opts)); - - // Configure window - Window win 
= calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); - - AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - update_window_and_padding(win, input_access, output_access); - output_access.set_valid_region(win, input->info()->valid_region()); - - ICLKernel::configure_internal(win); -} - -void CLNegKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice); - add_3D_tensor_argument(idx, _output, slice); - enqueue(queue, *this, slice, lws_hint()); - } while (collapsed.slide_window_slice_3D(slice)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp deleted file mode 100644 index 12bbe910f..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - NormalizationLayerInfo norm_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - - // Checks performed when output is configured - if (output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); - } - - return Status{}; -} - -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, - NormalizationLayerInfo norm_info) -{ - // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*output, *input->clone()); - - const unsigned int norm_size = norm_info.norm_size(); - bool is_in_map = norm_info.is_in_map(); - - const unsigned int border_width = is_in_map ? std::min(norm_size / 2, 3U) : 0; - const BorderSize border_size = BorderSize(0, border_width); - - const unsigned int num_elems_processed_per_iteration = 4; - const unsigned int num_elems_read_per_iteration = - is_in_map ? 
(num_elems_processed_per_iteration + 2 * (norm_size / 2)) - : num_elems_processed_per_iteration; - - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - - // We do not use a Rectangle window for IN_MAP_2D as we clamp the top and bottom accesses inside - // the kernel, avoiding padding - AccessWindowHorizontal input_access(input, -border_size.left, num_elems_read_per_iteration); - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win, input_access, output_access); - - output_access.set_valid_region(win, input->valid_region()); - - Status err = (window_changed) - ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; - return std::make_pair(err, win); -} -} // namespace - -CLNormalizationLayerExKernel::CLNormalizationLayerExKernel() - : _input(nullptr), _output(nullptr), _border_size(0), _is_in_map(false) -{ -} - -BorderSize CLNormalizationLayerExKernel::border_size() const { return _border_size; } - -void CLNormalizationLayerExKernel::configure(const ICLTensor *input, ICLTensor *output, - NormalizationLayerInfo norm_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*output->info(), *input->info()->clone()); - - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), norm_info)); - - _input = input; - _output = output; - - const unsigned int num_elems_processed_per_iteration = 4; - const bool is_in_map_2D = (norm_info.type() == NormType::IN_MAP_2D); - - // Set build options - CLBuildOptions build_opts; - build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); - build_opts.add_option( - ("-DCOEFF=" + float_to_string_with_full_precision(norm_info.scale_coeff()))); - build_opts.add_option(("-DBETA=" + 
float_to_string_with_full_precision(norm_info.beta()))); - build_opts.add_option(("-DKAPPA=" + float_to_string_with_full_precision(norm_info.kappa()))); - build_opts.add_option( - ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); - build_opts.add_option(("-DRADIUS=" + support::cpp11::to_string(norm_info.norm_size()))); - build_opts.add_option(("-DNUM_SLICES=" + support::cpp11::to_string(input->info()->dimension(2)))); - build_opts.add_option_if(is_in_map_2D, "-DIN_MAP_2D"); - - // Create kernel - std::string kernel_name = - _is_in_map ? "normalization_layer_in_map" : "normalization_layer_cross_map"; - _kernel = static_cast<cl::Kernel>( - CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options())); - - // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), output->info(), norm_info); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); - - // Set config_id for enabling LWS tuning - _config_id = "normalization_layer_"; - _config_id += lower_string(string_from_data_type(input->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string( - static_cast<std::underlying_type<NormType>::type>(norm_info.type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(norm_info.norm_size()); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(1)); -} - -Status CLNormalizationLayerExKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - NormalizationLayerInfo norm_info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, norm_info)); - ARM_COMPUTE_RETURN_ON_ERROR( - validate_and_configure_window(input->clone().get(), output->clone().get(), norm_info).first); - - return Status{}; -} - -void CLNormalizationLayerExKernel::run(const Window &window, 
cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - const int collapsed_dimension = _is_in_map ? Window::DimZ : 4; - Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), collapsed_dimension); - Window slice = window_collapsed.first_slice_window_3D(); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice); - add_3D_tensor_argument(idx, _output, slice); - enqueue(queue, *this, slice); - } while (window_collapsed.slide_window_slice_3D(slice)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp deleted file mode 100644 index 241f8ae4d..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/core/CL/kernels/CLPReLUKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -constexpr unsigned int num_elems_processed_per_iteration = 16; - -Status validate_info(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output) -{ - const TensorShape &out_shape = - TensorShape::broadcast_shape(input->tensor_shape(), alpha->tensor_shape()); - - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, - DataType::QASYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(alpha, 1, DataType::F16, DataType::F32, - DataType::QASYMM8); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, - "Inputs are not broadcast compatible"); - // Validate in case of configured output - if (output->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), - "Wrong shape for output"); - } - return Status{}; -} -} // namespace - -CLPReLUKernel::CLPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {} - -void CLPReLUKernel::configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output) -{ - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, alpha); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), alpha->info(), output->info())); - - _input = input; - _alpha = alpha; - _output = output; - - // Create kernel - std::string kernel_name = "prelu"; - std::set<std::string> build_opts; - build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); - build_opts.emplace( - ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); - - if 
(is_data_type_quantized_asymmetric(input->info()->data_type())) - { - build_opts.emplace("-DOFF_IN1=" + - support::cpp11::to_string(input->info()->quantization_info().offset)); - build_opts.emplace("-DOFF_IN2=" + - support::cpp11::to_string(alpha->info()->quantization_info().offset)); - build_opts.emplace("-DOFF_OUT=" + - support::cpp11::to_string(output->info()->quantization_info().offset)); - build_opts.emplace("-DSCALE_IN1=" + - support::cpp11::to_string(input->info()->quantization_info().scale)); - build_opts.emplace("-DSCALE_IN2=" + - support::cpp11::to_string(alpha->info()->quantization_info().scale)); - build_opts.emplace("-DSCALE_OUT=" + - support::cpp11::to_string(output->info()->quantization_info().scale)); - kernel_name += "_qasymm8"; - } - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); - - const std::pair<TensorShape, ValidRegion> broadcast_pair = - ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info()); - - const TensorShape &out_shape = broadcast_pair.first; - const ValidRegion &valid_region = broadcast_pair.second; - - // Auto initialize output if not initialized - { - set_shape_if_empty(*output->info(), out_shape); - - if (input->info()->data_type() == DataType::F16 && alpha->info()->data_type() == DataType::F16) - { - set_format_if_unknown(*output->info(), Format::F16); - } - else if (input->info()->data_type() == DataType::F32 || - alpha->info()->data_type() == DataType::F32) - { - set_format_if_unknown(*output->info(), Format::F32); - } - } - - Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); - Window win_input1 = win.broadcast_if_dimension_le_one(*input->info()); - Window win_input2 = win.broadcast_if_dimension_le_one(*alpha->info()); - - AccessWindowHorizontal input1_access(input->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal input2_access(alpha->info(), 0, num_elems_processed_per_iteration); - 
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win_input1, input1_access) || - update_window_and_padding(win_input2, input2_access) || - update_window_and_padding(win, output_access); - - output_access.set_valid_region(win, valid_region); - - ICLKernel::configure_internal(win); -} - -void CLPReLUKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const TensorShape &in_shape1 = _input->info()->tensor_shape(); - const TensorShape &in_shape2 = _alpha->info()->tensor_shape(); - const TensorShape &out_shape = _output->info()->tensor_shape(); - - bool can_collapse = true; - if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) - { - can_collapse = - (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); - for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) - { - can_collapse = (in_shape1[d] == in_shape2[d]); - } - } - - bool has_collapsed = false; - Window collapsed = - can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) - : window; - - const TensorShape &in_shape1_collapsed = - has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; - const TensorShape &in_shape2_collapsed = - has_collapsed ? 
in_shape2.collapsed_from(Window::DimZ) : in_shape2; - - Window slice = collapsed.first_slice_window_3D(); - Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); - Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice_input1); - add_3D_tensor_argument(idx, _alpha, slice_input2); - add_3D_tensor_argument(idx, _output, slice); - - enqueue(queue, *this, slice); - - collapsed.slide_window_slice_3D(slice_input1); - collapsed.slide_window_slice_3D(slice_input2); - } while (collapsed.slide_window_slice_3D(slice)); -} - -BorderSize CLPReLUKernel::border_size() const -{ - const unsigned int replicateSize = - _output->info()->dimension(0) - - std::min(_input->info()->dimension(0), _alpha->info()->dimension(0)); - const unsigned int border = - std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); - return BorderSize(0, border, 0, 0); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp deleted file mode 100644 index 99b54c822..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/core/CL/kernels/CLPadLayerKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -Status validate_arguments(const ITensorInfo *input_info, const ITensorInfo *output_info, - const ITensorInfo *pad_size_info) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_info, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_info, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(pad_size_info, 1, DataType::S32); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_info->num_dimensions() > 0 && - input_info->num_dimensions() <= 4, - "Pad kernel supports upto 4-D input tensor"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - input_info->num_dimensions() == output_info->num_dimensions(), - "output tensor should have same number of dimensions as input tensor"); - - if (input_info->data_type() == DataType::QASYMM8) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_info->quantization_info() != - output_info->quantization_info(), - "The input and output quantization info are different!"); - } - - return Status{}; -} - -} // namespace - -CLPadLayerKernel::CLPadLayerKernel() : _input(nullptr), _output(nullptr), _pad_size(nullptr) {} - -void CLPadLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *pad_size) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, pad_size); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pad_size->info())); - - _input = input; - _output = output; - _pad_size = pad_size; - - // Set kernel build options - std::set<std::string> build_opts; - build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - 
build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); - build_opts.emplace("-DIB=" + support::cpp11::to_string(input->info()->dimension(3))); - build_opts.emplace("-DIW=" + support::cpp11::to_string(input->info()->dimension(0))); - build_opts.emplace("-DIH=" + support::cpp11::to_string(input->info()->dimension(1))); - build_opts.emplace("-DID=" + support::cpp11::to_string(input->info()->dimension(2))); - if (input->info()->data_type() == DataType::QASYMM8) - { - build_opts.emplace("-DZERO_VALUE=" + - support::cpp11::to_string(input->info()->quantization_info().offset)); - } - else - { - build_opts.emplace("-DZERO_VALUE=" + support::cpp11::to_string(0)); - } - - // Create kernel - _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("pad", build_opts)); - - // Configure kernel window - Window win = calculate_max_window(*output->info(), Steps()); - - Coordinates coord; - coord.set_num_dimensions(output->info()->num_dimensions()); - output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); - - ICLKernel::configure_internal(win); -} - -void CLPadLayerKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); - - _pad_size->map(queue); - - // Padding values only for up, top, left and front are required based on the rank of tensor - int rank = _pad_size->info()->dimension(1); - - auto pad_batch_up = - (rank == 4) ? *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, 0})) : 0; - auto pad_height_top = - (rank >= 2) - ? *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, (rank == 2) ? 0 : 1})) - : 0; - auto pad_width_left = (rank >= 1) - ? *reinterpret_cast<const int32_t *>( - _pad_size->ptr_to_element({0, (rank == 4) ? 2 : rank - 1})) - : 0; - auto pad_depth_front = - (rank >= 3) - ? 
*reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, (rank == 3) ? 0 : 3})) - : 0; - - _pad_size->unmap(queue); - - // Pad_values which needs to be passed - const cl_int4 paddingValues = { - {static_cast<cl_int>(pad_width_left), static_cast<cl_int>(pad_height_top), - static_cast<cl_int>(pad_depth_front), static_cast<cl_int>(pad_batch_up)}}; - - Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); - - // Setup output slice - Window slice_in(slice_out); - slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - slice_in.set(3, Window::Dimension(0, 0, 0)); - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, _input, slice_in); - add_4D_tensor_argument(idx, _output, slice_out); - _kernel.setArg<cl_int4>(idx++, paddingValues); - enqueue(queue, *this, slice_out); - } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp deleted file mode 100644 index aa094761c..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLPermuteExKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" - -using namespace arm_compute; - -namespace -{ -TensorShape get_output_shape(const ITensorInfo *input, const PermutationVector &perm) -{ - TensorShape output_shape = input->tensor_shape(); - permute(output_shape, perm); - return output_shape; -} - -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - const PermutationVector &perm) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); - - const TensorShape output_shape = - misc::shape_calculator::compute_permutation_output_shape(*input, perm); - - // Validate configured output - if (output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - } - return Status{}; -} -} // namespace - -CLPermuteExKernel::CLPermuteExKernel() : _input(nullptr), _output(nullptr), _perm() {} - -void CLPermuteExKernel::configure(const ICLTensor *input, ICLTensor *output, - const PermutationVector &perm) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), perm)); - - _input = input; - _output = output; - _perm = perm; - - const TensorShape output_shape = get_output_shape(input->info(), perm); - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); - - // Create kernel - std::set<std::string> 
build_opts; - - build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2))); - - // New positions of batch(D), height(H), width(w) and channel(C) based on permutation vector - build_opts.emplace("-DP1=" + support::cpp11::to_string(perm[0])); - build_opts.emplace("-DP2=" + support::cpp11::to_string(perm[1])); - build_opts.emplace("-DP3=" + support::cpp11::to_string(perm[2])); - build_opts.emplace("-DP4=" + support::cpp11::to_string(perm[3])); - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("permute_generic", build_opts)); - - // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps()); - - // The CLPermute doesn't need padding so update_window_and_padding() can be skipped - Coordinates coord; - coord.set_num_dimensions(output->info()->num_dimensions()); - output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); - - ICLKernel::configure_internal(win); -} - -Status CLPermuteExKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const PermutationVector &perm) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, perm)); - - return Status{}; -} - -void CLPermuteExKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); - - Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); - - // Setup output slice - Window slice_out(slice_in); - slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); - slice_out.set(3, Window::Dimension(0, 0, 0)); - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, 
_input, slice_in); - add_4D_tensor_argument(idx, _output, slice_out); - enqueue(queue, *this, slice_in); - } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp deleted file mode 100644 index b985aa737..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp +++ /dev/null @@ -1,280 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -constexpr unsigned int num_elems_processed_per_iteration = 16; - -Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, - RoundingPolicy rounding_policy) -{ - ARM_COMPUTE_UNUSED(overflow_policy); - ARM_COMPUTE_UNUSED(rounding_policy); - - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, - DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, - DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative."); - - const TensorShape &out_shape = - TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, - "Inputs are not broadcast compatible"); - - // Validate in case of configured output - if (output->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, - DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - output->data_type() == DataType::U8 && - (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8), - "Output can only be U8 if both inputs are U8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), - "Wrong shape for output"); - } - - return Status{}; -} - -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, - ITensorInfo *output) -{ - const std::pair<TensorShape, ValidRegion> broadcast_pair = - ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2); - const 
TensorShape &out_shape = broadcast_pair.first; - const ValidRegion &valid_region = broadcast_pair.second; - - // Auto initialize output if not initialized - { - set_shape_if_empty(*output, out_shape); - - if (input1->data_type() == DataType::S16 || input2->data_type() == DataType::S16) - { - set_format_if_unknown(*output, Format::S16); - } - else if (input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32) - { - set_format_if_unknown(*output, Format::F32); - } - } - - Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); - Window win_input1 = win.broadcast_if_dimension_le_one(*input1); - Window win_input2 = win.broadcast_if_dimension_le_one(*input2); - - AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win_input1, input1_access) || - update_window_and_padding(win_input2, input2_access) || - update_window_and_padding(win, output_access); - - output_access.set_valid_region(win, valid_region); - - Status err = (window_changed) - ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; - return std::make_pair(err, win); -} -} // namespace - -CLPixelWiseDivisionKernel::CLPixelWiseDivisionKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr) -{ -} - -void CLPixelWiseDivisionKernel::configure(const ICLTensor *input1, const ICLTensor *input2, - ICLTensor *output, float scale, - ConvertPolicy overflow_policy, - RoundingPolicy rounding_policy) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(), - scale, overflow_policy, rounding_policy)); - - // Configure kernel window - auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - - _input1 = input1; - _input2 = input2; - _output = output; - - int scale_int = -1; - // Extract sign, exponent and mantissa - int exponent = 0; - float normalized_mantissa = std::frexp(scale, &exponent); - // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15 - // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -1 <= e <= - // 14 - // Moreover, it will be negative as we deal with 1/2^n - if ((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)) - { - // Store the positive exponent. We know that we compute 1/2^n - // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5 - scale_int = std::abs(exponent - 1); - } - - std::string data_type; - std::string compute_type; - // Check if it has float inputs and output - if (is_data_type_float(input1->info()->data_type()) || - is_data_type_float(input2->info()->data_type())) - { - scale_int = -1; - compute_type = (input1->info()->data_type() == DataType::F32 || - input2->info()->data_type() == DataType::F32) - ? 
"float" - : "half"; - data_type = "DATA_TYPE_FLOAT"; - } - else - { - if (input1->info()->data_type() == DataType::S16 || - input2->info()->data_type() == DataType::S16) - { - compute_type = "int"; - } - else - { - compute_type = "ushort"; - } - data_type = "DATA_TYPE_INT"; - } - - // Construct kernel name - std::string kernel_name = "pixelwise_div"; - kernel_name += (scale_int >= 0) ? "_int" : "_float"; - - // Set kernel build options - std::set<std::string> build_opts; - build_opts.emplace( - (overflow_policy == ConvertPolicy::WRAP || is_data_type_float(output->info()->data_type())) - ? "-DWRAP" - : "-DSATURATE"); - build_opts.emplace((rounding_policy == RoundingPolicy::TO_ZERO) ? "-DROUND=_rtz" - : "-DROUND=_rte"); - build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type())); - build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type())); - build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); - build_opts.emplace("-DDATA_TYPE_RES=" + compute_type); - build_opts.emplace("-D" + data_type); - - // Create kernel - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); - - // Set scale argument - unsigned int idx = 3 * num_arguments_per_3D_tensor(); // Skip the inputs and output parameters - - if (scale_int >= 0) - { - _kernel.setArg(idx++, scale_int); - } - else - { - _kernel.setArg(idx++, scale); - } - - ICLKernel::configure_internal(win_config.second); -} - -Status CLPixelWiseDivisionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *output, float scale, - ConvertPolicy overflow_policy, - RoundingPolicy rounding_policy) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); - ARM_COMPUTE_RETURN_ON_ERROR( - validate_arguments(input1, input2, output, scale, overflow_policy, rounding_policy)); - 
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), - input2->clone().get(), - output->clone().get()) - .first); - - return Status{}; -} - -void CLPixelWiseDivisionKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const TensorShape &in_shape1 = _input1->info()->tensor_shape(); - const TensorShape &in_shape2 = _input2->info()->tensor_shape(); - const TensorShape &out_shape = _output->info()->tensor_shape(); - - bool can_collapse = true; - if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) - { - can_collapse = - (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); - for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d) - { - can_collapse = (in_shape1[d] == in_shape2[d]); - } - } - - bool has_collapsed = false; - Window collapsed = - can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) - : window; - - const TensorShape &in_shape1_collapsed = - has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; - const TensorShape &in_shape2_collapsed = - has_collapsed ? 
in_shape2.collapsed_from(Window::DimZ) : in_shape2; - - Window slice = collapsed.first_slice_window_3D(); - Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); - Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input1, slice_input1); - add_3D_tensor_argument(idx, _input2, slice_input2); - add_3D_tensor_argument(idx, _output, slice); - enqueue(queue, *this, slice); - - collapsed.slide_window_slice_3D(slice_input1); - collapsed.slide_window_slice_3D(slice_input2); - } while (collapsed.slide_window_slice_3D(slice)); -} - -BorderSize CLPixelWiseDivisionKernel::border_size() const -{ - const unsigned int replicateSize = - _output->info()->dimension(0) - - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); - const unsigned int border = - std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); - return BorderSize(0, border, 0, 0); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp deleted file mode 100644 index f581780e1..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; -namespace -{ -// NOTE This is necessary because it is not guaranteed that the axis positions of input and output -// are the same. -const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis) -{ - TensorShape out_shape{input_shape}; - - out_shape.set(axis, 1); - - return out_shape; -} -} // namespace - -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis, - ReduceOperation op) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - - if (output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - } - - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, - DataType::F32, DataType::S32); - if (op == ReduceOperation::MEAN || op == ReduceOperation::SUM) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8, - "Not support QASYMM8, yet"); - } - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0, - "Inputs are not broadcast compatible"); - - const auto num_dimensions = input->tensor_shape().num_dimensions(); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - axis >= 0 && axis < num_dimensions, - "axis must be greater than or equal to 0 and less than (input's rank)."); - - const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(), - "output shape's size does not match axis"); - - return Status{}; -} -} // namespace - -CLReduceOperationKernel::CLReduceOperationKernel() : _input(nullptr), _output(nullptr), _axis() {} - -void 
CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *output, - const uint32_t axis, ReduceOperation op) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op)); - - _input = input; - _output = output; - _axis = axis; - - std::unique_ptr<ITensorInfo> output_info = output->info()->clone(); - output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis)); - - // Construct kernel name - std::string kernel_name; - int op_code = 0; - if (op == ReduceOperation::MAX) - { - kernel_name = "reduce_min_max"; - op_code = 1; - } - else if (op == ReduceOperation::MIN) - { - kernel_name = "reduce_min_max"; - op_code = 2; - } - else if (op == ReduceOperation::SUM) - { - kernel_name = "reduce_sum_mean"; - op_code = 3; - } - else if (op == ReduceOperation::MEAN) - { - kernel_name = "reduce_sum_mean"; - op_code = 4; - } - else - throw std::runtime_error("Operation not supported, yet"); - - // Set kernel build options - std::set<std::string> build_opts; - build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(output_info->data_type())); - build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2))); - build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code)); - - // Create kernel - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); - - // Configure kernel window - Window win = calculate_max_window(*output_info, Steps()); - - Coordinates coord; - coord.set_num_dimensions(output_info->num_dimensions()); - output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape())); - - ICLKernel::configure_internal(win); -} - -Status CLReduceOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const uint32_t axis, ReduceOperation op) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - 
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op)); - - return Status{}; -} - -void CLReduceOperationKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const TensorShape &shape_in = _input->info()->tensor_shape(); - - unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters - - _kernel.setArg<cl_int>(idx++, _axis); - _kernel.setArg<cl_int>(idx++, shape_in[_axis]); - - // Support dimensions up to 4 - Window slice_out = window.collapse(ICLKernel::window(), 2, 4); - - // Setup input slice - Window slice_in(slice_out); - slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - slice_in.set(3, Window::Dimension(0, 0, 0)); - - // Copy output's shape in order to use for recovering at end of this method - // TODO Remove changing and recovering output's shape if it is guaranteed that the axis positions - // of input and output are the same - const TensorShape shape_out = _output->info()->tensor_shape(); - _output->info()->set_tensor_shape(inferOutputShape(shape_in, _axis)); - - idx = 0; - add_4D_tensor_argument(idx, _input, slice_in); - add_4D_tensor_argument(idx, _output, slice_out); - enqueue(queue, *this, slice_out); - - // Recover output's shape of output tensor - _output->info()->set_tensor_shape(shape_out); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp deleted file mode 100644 index 6b0697e89..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp +++ /dev/null @@ -1,238 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -constexpr unsigned int num_elems_processed_per_iteration = 16; - -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_size, - const ITensorInfo *padding_size, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::F16, DataType::S32, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_size, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(padding_size, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::F16, DataType::S32, - DataType::F32); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != output->num_dimensions(), - "The number of dimensions of input should be equal to output"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() != output->data_layout(), - "The input and output layouts are different!"); - - // TODO Support other cases - if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NCHW) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) 
!= output->dimension(2), - "Input Depth should be equal to Output Depth"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 || - padding_size->dimension(1) != 2, - "Only 2-dimensional spatial block's size was wrong"); - } - else if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NHWC) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(0) != output->dimension(0), - "Input Depth should be equal to Output Depth"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 || - padding_size->dimension(1) != 2, - "Only 2-dimensional spatial block's size was wrong"); - } - else - { - ARM_COMPUTE_RETURN_ERROR_MSG("CLSpaceToBatchNDKernel supports only 4-dimensional input"); - } - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() < 2 && input->num_dimensions() > 4, - "CLSpaceToBatchNDKernel supports dimensions up to 4"); - - if (input->data_type() == DataType::QASYMM8) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->quantization_info() != output->quantization_info(), - "The input and output quantization info are different!"); - } - - return Status{}; -} - -} // namespace - -CLSpaceToBatchNDKernel::CLSpaceToBatchNDKernel() : _input(nullptr), _output(nullptr) {} - -void CLSpaceToBatchNDKernel::configure(const ICLTensor *input, const ICLTensor *block_size, - const ICLTensor *padding_size, ICLTensor *output) -{ - - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(input->info(), block_size->info(), padding_size->info(), output->info())); - - _input = input; - _block_size = block_size; - _padding_size = padding_size; - _output = output; - - // Set kernel build options - // TODO Support other cases - std::string kernel_name = "space_to_batch_4d"; - std::set<std::string> build_opts; - Window win; - - if (input->info()->data_layout() == DataLayout::NCHW) - { - kernel_name += "_nchw"; - build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); - 
build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(1))); - build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(0))); - - win = calculate_max_window(*output->info(), Steps()); - - Coordinates coord; - coord.set_num_dimensions(output->info()->num_dimensions()); - output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); - } - else if (input->info()->data_layout() == DataLayout::NHWC) - { - kernel_name += "_nhwc"; - build_opts.emplace("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); - build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(2))); - build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(1))); - build_opts.emplace("-DVEC_SIZE=" + - support::cpp11::to_string(num_elems_processed_per_iteration)); - - win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win, input_access, output_access); - input_access.set_valid_region(win, output->info()->valid_region()); - - if (window_changed) - { - ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!"); - } - } - else - { - ARM_COMPUTE_ERROR("Unsupported layout"); - } - - build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.emplace("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(3))); - if (input->info()->data_type() == DataType::QASYMM8) - { - build_opts.emplace("-DZERO_VALUE=" + - support::cpp11::to_string(input->info()->quantization_info().offset)); - } - else - { - build_opts.emplace("-DZERO_VALUE=" + support::cpp11::to_string(0)); - } - - // Create kernel - _kernel = - 
static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); - - // Configure kernel window - ICLKernel::configure_internal(win); -} - -void CLSpaceToBatchNDKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); - -#if defined(DEBUG) - const_cast<ICLTensor *>(_block_size)->map(queue); - const_cast<ICLTensor *>(_padding_size)->map(queue); - - const size_t num_dimensions = _input->info()->num_dimensions(); - const size_t num_spacial_dimensions = _block_size->info()->dimension(0); - int32_t batch_size = _input->info()->dimension(num_dimensions - 1); - for (size_t i = 0; i < num_spacial_dimensions; ++i) - { - const int32_t block_size = *reinterpret_cast<int32_t *>(_block_size->ptr_to_element({i})); - const int32_t padding_size_pre = - *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({0, i})); - const int32_t padding_size_post = - *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({1, i})); - - ARM_COMPUTE_ERROR_ON_MSG(block_size < 1, "Block size should be greater than or equal to 1"); - ARM_COMPUTE_ERROR_ON_MSG(padding_size_pre < 0 && padding_size_post < 0, - "Padding size should be greater than or equal to 0"); - - if (num_dimensions == 4 && _input->info()->data_layout() == DataLayout::NCHW) - { - ARM_COMPUTE_ERROR_ON_MSG( - _output->info()->dimension(i) != - (_input->info()->dimension(i) + padding_size_pre + padding_size_post) / block_size, - "Dimension value of spatial block does not match output's dimension value"); - } - else - { - ARM_COMPUTE_ERROR_ON_MSG( - _output->info()->dimension(num_dimensions - num_spacial_dimensions - 1 + i) != - (_input->info()->dimension(num_dimensions - num_spacial_dimensions - 1 + i) + - padding_size_pre + padding_size_post) / - block_size, - "Dimension value of spatial block does not match output's dimension value"); - } - - batch_size *= block_size; - } - 
ARM_COMPUTE_ERROR_ON_MSG( - _output->info()->dimension(num_dimensions - 1) != batch_size, - "Output batch size should be equal to input batch size * (multiplication of all block size)"); - - const_cast<ICLTensor *>(_block_size)->unmap(queue); - const_cast<ICLTensor *>(_padding_size)->unmap(queue); -#endif // defined(DEBUG) - - Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); - - // Setup output slice - Window slice_in(slice_out); - slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - slice_in.set(3, Window::Dimension(0, 0, 0)); - - // Set block size window - Window win_block = calculate_max_window(*_block_size->info(), Steps()); - - // Set padding size window - Window win_padding = calculate_max_window(*_padding_size->info(), Steps()); - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, _input, slice_in); - add_4D_tensor_argument(idx, _output, slice_out); - add_1D_tensor_argument(idx, _block_size, win_block); - add_2D_tensor_argument(idx, _padding_size, win_padding); - enqueue(queue, *this, slice_out); - } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp deleted file mode 100644 index 5d6329edc..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - const int32_t block_size) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size >= 1, - "Block size should be greater than or equal to 1."); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(3) == output->dimension(3), - "Input batch should be equal to Output batch"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - input->dimension(2) * block_size * block_size == output->dimension(2), - "Output depth should be equal to (input depth * block size *block size)"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(input->dimension(0) % block_size) && - !(input->dimension(1) % block_size), - "Input height and width should be divisible by block size"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) == (input->dimension(0) / block_size)) && - (output->dimension(1) == (input->dimension(1) / block_size)), - "Output height and width should be equal to " - "input_height/blocksize and input_width/blocksize respectively"); - - return 
Status{}; -} - -} // namespace - -CLSpaceToDepthKernel::CLSpaceToDepthKernel() : _input(nullptr), _output(nullptr) {} - -void CLSpaceToDepthKernel::configure(const ICLTensor *input, ICLTensor *output, - const int32_t block_size) -{ - - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size)); - - _input = input; - _output = output; - - // Set kernel build options - std::set<std::string> build_opts; - build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size)); - build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2))); - - // Create kernel - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("space_to_depth", build_opts)); - - // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps()); - - Coordinates coord; - coord.set_num_dimensions(output->info()->num_dimensions()); - output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); - - ICLKernel::configure_internal(win); -} - -void CLSpaceToDepthKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); - - Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); - - // Setup output slice - Window slice_out(slice_in); - slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); - slice_out.set(3, Window::Dimension(0, 0, 0)); - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, _input, slice_in); - add_4D_tensor_argument(idx, _output, slice_out); - enqueue(queue, *this, slice_in); - } while (window.slide_window_slice_4D(slice_in) && 
window.slide_window_slice_4D(slice_out)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp deleted file mode 100644 index 260bc39f1..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp +++ /dev/null @@ -1,170 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/core/CL/kernels/CLSquaredDifferenceKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -constexpr unsigned int num_elems_processed_per_iteration = 16; - -Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) -{ - const TensorShape &out_shape = - TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); - - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::F16, DataType::F32); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, - "Inputs are not broadcast compatible"); - // Validate in case of configured output - if (output->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), - "Wrong shape for output"); - } - return Status{}; -} -} // namespace - -CLSquaredDifferenceKernel::CLSquaredDifferenceKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr) -{ -} - -void CLSquaredDifferenceKernel::configure(const ICLTensor *input1, const ICLTensor *input2, - ICLTensor *output) -{ - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); - ARM_COMPUTE_ERROR_THROW_ON(validate(input1->info(), input2->info(), output->info())); - - _input1 = input1; - _input2 = input2; - _output = output; - - // Create kernel - std::set<std::string> build_opts; - build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type()))); - build_opts.emplace( - ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); - _kernel = 
static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("squared_difference", build_opts)); - - const std::pair<TensorShape, ValidRegion> broadcast_pair = - ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info()); - - const TensorShape &out_shape = broadcast_pair.first; - const ValidRegion &valid_region = broadcast_pair.second; - - // Auto initialize output if not initialized - { - set_shape_if_empty(*output->info(), out_shape); - - if (input1->info()->data_type() == DataType::F16 && - input2->info()->data_type() == DataType::F16) - { - set_format_if_unknown(*output->info(), Format::F16); - } - else if (input1->info()->data_type() == DataType::F32 || - input2->info()->data_type() == DataType::F32) - { - set_format_if_unknown(*output->info(), Format::F32); - } - } - - Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); - Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info()); - Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info()); - - AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win_input1, input1_access) || - update_window_and_padding(win_input2, input2_access) || - update_window_and_padding(win, output_access); - - output_access.set_valid_region(win, valid_region); - - ICLKernel::configure_internal(win); -} - -void CLSquaredDifferenceKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const TensorShape &in_shape1 = _input1->info()->tensor_shape(); - const TensorShape &in_shape2 = _input2->info()->tensor_shape(); - const TensorShape &out_shape = 
_output->info()->tensor_shape(); - - bool can_collapse = true; - if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) - { - can_collapse = - (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); - for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) - { - can_collapse = (in_shape1[d] == in_shape2[d]); - } - } - - bool has_collapsed = false; - Window collapsed = - can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) - : window; - - const TensorShape &in_shape1_collapsed = - has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; - const TensorShape &in_shape2_collapsed = - has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; - - Window slice = collapsed.first_slice_window_3D(); - Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); - Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input1, slice_input1); - add_3D_tensor_argument(idx, _input2, slice_input2); - add_3D_tensor_argument(idx, _output, slice); - - enqueue(queue, *this, slice); - - collapsed.slide_window_slice_3D(slice_input1); - collapsed.slide_window_slice_3D(slice_input2); - } while (collapsed.slide_window_slice_3D(slice)); -} - -BorderSize CLSquaredDifferenceKernel::border_size() const -{ - const unsigned int replicateSize = - _output->info()->dimension(0) - - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); - const unsigned int border = - std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); - return BorderSize(0, border, 0, 0); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp deleted file mode 100644 index 48146a43a..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp 
+++ /dev/null @@ -1,253 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLStridedSliceExKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/TensorInfo.h" - -using namespace arm_compute; - -CLStridedSliceExKernel::CLStridedSliceExKernel() - : _input(nullptr), _output(nullptr), _beginData(nullptr), _endData(nullptr), - _stridesData(nullptr), _beginMask(0), _endMask(0), _shrinkAxisMask(0) -{ -} - -Status CLStridedSliceExKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *begin, const ITensorInfo *end, - const ITensorInfo *strides, int32_t beginMask, - int32_t endMask, int32_t shrinkAxisMask) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, begin, end, strides); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(begin, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(end, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(strides, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, 
output); - - ARM_COMPUTE_ERROR_ON(begin->num_dimensions() != 1 || begin->dimension(0) > 4); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(begin->tensor_shape(), end->tensor_shape(), - strides->tensor_shape()); - - return Status{}; -} - -// Return the index for the first element along that axis. This index will be a -// positive integer between [0, axisSize - 1] that can be used to index -// directly into the data. -inline int32_t StartForAxis(int32_t beginMask, int32_t begin, int32_t stride, - const TensorShape &inputShape, int32_t axis) -{ - // Begin with the specified index - int32_t start = begin; - - // beginMask override - if (beginMask & 1 << axis) - { - if (stride > 0) - { - // Forward iteration - use the first element. These values will get - // clamped below (Note: We could have set them to 0 and axisSize-1, but - // use lowest() and max() to maintain symmetry with StopForAxis()) - start = std::numeric_limits<int32_t>::lowest(); - } - else - { - // Backward iteration - use the last element. - start = std::numeric_limits<int32_t>::max(); - } - } - - // Handle negative indices - int32_t axisSize = inputShape[axis]; - if (start < 0) - { - start += axisSize; - } - - // Clamping - start = arm_compute::utility::clamp(start, 0, axisSize - 1); - - return start; -} - -// Return the "real" index for the end of iteration along that axis. This is an -// "end" in the traditional C sense, in that it points to one past the last -// element. ie. So if you were iterating through all elements of a 1D array of -// size 4, this function would return 4 as the stop, because it is one past the -// "real" indices of 0, 1, 2 & 3. -inline int32_t StopForAxis(int32_t endMask, int32_t end, int32_t stride, - const TensorShape &inputShape, int32_t axis) -{ - // Begin with the specified index - int32_t stop = end; - - // endMask override - if (endMask & (1 << axis)) - { - if (stride > 0) - { - // Forward iteration - use the last element. 
These values will get - // clamped below - stop = std::numeric_limits<int32_t>::max(); - } - else - { - // Backward iteration - use the first element. - stop = std::numeric_limits<int32_t>::lowest(); - } - } - - // Handle negative indices - int32_t axisSize = inputShape[axis]; - if (stop < 0) - { - stop += axisSize; - } - - // Clamping - // Because the end index points one past the last element, we need slightly - // different clamping ranges depending on the direction. - if (stride > 0) - { - // Forward iteration - stop = arm_compute::utility::clamp(stop, 0, axisSize); - } - else - { - // Backward iteration - stop = arm_compute::utility::clamp(stop, -1, axisSize - 1); - } - - return stop; -} - -inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride) -{ - int32_t ret = 0; - if (stride > 0) - { - ret = ((stop - start - 1) / stride) + 1; - } - else - { - ret = ((stop - start + 1) / stride) + 1; - } - ARM_COMPUTE_ERROR_ON_MSG(ret < 0, "The dimension must be the natural number"); - return ret; -} - -void CLStridedSliceExKernel::configure(const ICLTensor *input, ICLTensor *output, - ICLTensor *beginData, ICLTensor *endData, - ICLTensor *stridesData, int32_t beginMask, int32_t endMask, - int32_t shrinkAxisMask) -{ - ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), beginData->info(), - endData->info(), stridesData->info(), beginMask, endMask, - shrinkAxisMask)); - - _input = input; - _output = output; - _beginData = beginData; - _endData = endData; - _stridesData = stridesData; - _beginMask = beginMask; - _endMask = endMask; - _shrinkAxisMask = shrinkAxisMask; - - // Set kernel build options - std::set<std::string> build_opts; - build_opts.emplace("-DELEMENT_DATA_TYPE=" + - get_cl_type_from_data_type(input->info()->data_type())); - build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); - - // Create kernel - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("strided_slice_ex", 
build_opts)); - - // Configure kernel window - Window win = calculate_max_window(*output->info(), Steps()); - ICLKernel::configure_internal(win); -} - -void CLStridedSliceExKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - _beginData->map(queue); - _endData->map(queue); - _stridesData->map(queue); - - std::vector<int32_t> starts; - std::vector<int32_t> strides; - - for (uint32_t n = 0; n < _beginData->info()->tensor_shape().total_size(); ++n) - { - const TensorShape shape = _input->info()->tensor_shape(); - starts.emplace_back( - StartForAxis(_beginMask, reinterpret_cast<int32_t *>(_beginData->buffer())[n], - reinterpret_cast<int32_t *>(_stridesData->buffer())[n], shape, n)); - - strides.emplace_back(reinterpret_cast<int32_t *>(_stridesData->buffer())[n]); - } - - for (uint32_t n = _beginData->info()->tensor_shape().total_size(); n < 4; n++) - { - starts.emplace_back(0); - strides.emplace_back(1); - } - // TODO: Apply shrinkAxisMask - - _beginData->unmap(queue); - _stridesData->unmap(queue); - _endData->unmap(queue); - - unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters - const cl_int4 startsArg = {{ - static_cast<cl_int>(starts[0]), static_cast<cl_int>(starts[1]), - static_cast<cl_int>(starts[2]), static_cast<cl_int>(starts[3]), - }}; - _kernel.setArg<cl_int4>(idx++, startsArg); - - const cl_int4 stridesArg = {{ - static_cast<cl_int>(strides[0]), static_cast<cl_int>(strides[1]), - static_cast<cl_int>(strides[2]), static_cast<cl_int>(strides[3]), - }}; - _kernel.setArg<cl_int4>(idx++, stridesArg); - - Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); - - // Setup output slice - Window slice_in(slice_out); - slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimZ, 
Window::Dimension(0, 0, 0)); - slice_in.set(3, Window::Dimension(0, 0, 0)); - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, _input, slice_in); - add_4D_tensor_argument(idx, _output, slice_out); - enqueue(queue, *this, slice_out); - } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp deleted file mode 100644 index 073c2f7bb..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp +++ /dev/null @@ -1,468 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -namespace arm_compute -{ -//////////////////////////////////////////////////////////////////////////////// -CLTopKV2Single::CLTopKV2Single() : _input(nullptr), _topk_values(nullptr), _topk_indices(nullptr) {} - -void CLTopKV2Single::configure(ICLTensor *input, ICLTensor *topk_values, ICLTensor *topk_indices, - cl::Buffer *indices, cl::Buffer *temp_stack, int k, int n) -{ - ARM_COMPUTE_ERROR_ON(input == nullptr && indices == nullptr); - ARM_COMPUTE_ERROR_ON(topk_values == nullptr && topk_indices == nullptr); - ARM_COMPUTE_ERROR_ON(n == 0); - - _input = input; - _topk_values = topk_values; - _topk_indices = topk_indices; - - // Set kernel build options - std::set<std::string> build_opts; - - // Create kernel - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("topkv2_quicksort", build_opts)); - - unsigned int idx = 3 * num_arguments_per_1D_tensor(); - _kernel.setArg(idx++, *indices); - _kernel.setArg(idx++, *temp_stack); - _kernel.setArg<cl_int>(idx++, k); - _kernel.setArg<cl_int>(idx++, n); - - // Configure kernel window - Window win; - win.set(0, Window::Dimension(0, 1, 1)); - ICLKernel::configure_internal(win); -} - -void CLTopKV2Single::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - unsigned int idx = 0; - add_1D_tensor_argument(idx, _input, window); - add_1D_tensor_argument(idx, _topk_values, window); - add_1D_tensor_argument(idx, _topk_indices, window); - - enqueue(queue, *this, window); -} - -//////////////////////////////////////////////////////////////////////////////// -CLTopKV2Init::CLTopKV2Init() : _input(nullptr) {} - -void CLTopKV2Init::configure(ICLTensor *input, cl::Buffer *in_key_buf, 
cl::Buffer *in_ind_buf, - int n) -{ - ARM_COMPUTE_ERROR_ON(input == nullptr && in_key_buf == nullptr); - ARM_COMPUTE_ERROR_ON(in_ind_buf == nullptr); - ARM_COMPUTE_ERROR_ON(n == 0); - - _input = input; - - // Set kernel build options - std::set<std::string> build_opts; - - // Create kernel - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("topkv2_init", build_opts)); - - unsigned int idx = num_arguments_per_1D_tensor(); - _kernel.setArg(idx++, *in_key_buf); - _kernel.setArg(idx++, *in_ind_buf); - _kernel.setArg<cl_int>(idx++, n); - - // Configure kernel window - Window win; - win.set(0, Window::Dimension(0, n, 1)); - ICLKernel::configure_internal(win); -} - -void CLTopKV2Init::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - unsigned int idx = 0; - add_1D_tensor_argument(idx, _input, window); - - enqueue(queue, *this, window); -} - -//////////////////////////////////////////////////////////////////////////////// -// This kernel makes a histogram of radix for each work item. 
-CLRadixSortHistogram::CLRadixSortHistogram() : _pass(0), _in_key_buf(nullptr) {} - -void CLRadixSortHistogram::configure(cl::Buffer *hist_buf, int bits, int n) -{ - ARM_COMPUTE_ERROR_ON(hist_buf == nullptr); - - unsigned int radix = 1 << bits; - // Set kernel build options - std::set<std::string> build_opts; - build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); - build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); - build_opts.emplace("-DPERMUT=1"); - - // Create kernel - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("radixsort_histogram", build_opts)); - - int loc_histo_size = radix * _ITEMS * sizeof(cl_int); - - unsigned int idx = 1; - _kernel.setArg(idx++, *hist_buf); - - idx = 3; - _kernel.setArg(idx++, loc_histo_size, nullptr); - _kernel.setArg<cl_int>(idx++, n); - - // Configure kernel window - Window win; - win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1)); - ICLKernel::configure_internal(win); -} - -void CLRadixSortHistogram::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - _kernel.setArg(0, *_in_key_buf); - _kernel.setArg<cl_int>(2, _pass); - - cl::NDRange lws = cl::NDRange(_ITEMS, 1); - - enqueue(queue, *this, window, lws); -} - -//////////////////////////////////////////////////////////////////////////////// -CLRadixSortScanHistogram::CLRadixSortScanHistogram() {} - -void CLRadixSortScanHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits) -{ - ARM_COMPUTE_ERROR_ON(hist_buf == nullptr && glob_sum_buf == nullptr); - - unsigned int radix = 1 << bits; - // Set kernel build options - std::set<std::string> build_opts; - build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); - build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); - build_opts.emplace("-DPERMUT=1"); - - // Create kernel - _kernel = static_cast<cl::Kernel>( 
- CLKernelLibraryEx::get().create_kernel("radixsort_scanhistograms", build_opts)); - - int temp_size = - std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint); - - unsigned int idx = 0; - _kernel.setArg(idx++, *hist_buf); - _kernel.setArg(idx++, temp_size, nullptr); - _kernel.setArg(idx++, *glob_sum_buf); - - // Configure kernel window - Window win; - win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1)); - ICLKernel::configure_internal(win); -} - -void CLRadixSortScanHistogram::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); - cl::NDRange lws = cl::NDRange(gws_x / _HISTOSPLIT, 1); - - enqueue(queue, *this, window, lws); -} - -//////////////////////////////////////////////////////////////////////////////// -CLRadixSortGlobalScanHistogram::CLRadixSortGlobalScanHistogram() {} - -void CLRadixSortGlobalScanHistogram::configure(cl::Buffer *glob_sum_buf, cl::Buffer *temp_buf, - int bits) -{ - ARM_COMPUTE_ERROR_ON(glob_sum_buf == nullptr && temp_buf == nullptr); - - unsigned int radix = 1 << bits; - // Set kernel build options - std::set<std::string> build_opts; - build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); - build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); - build_opts.emplace("-DPERMUT=1"); - - // Create kernel - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("radixsort_scanhistograms", build_opts)); - - int temp_size = - std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint); - - unsigned int idx = 0; - _kernel.setArg(idx++, *glob_sum_buf); - _kernel.setArg(idx++, temp_size, nullptr); - _kernel.setArg(idx++, *temp_buf); - - // Configure kernel window - Window win; - win.set(0, Window::Dimension(0, 
_HISTOSPLIT / 2, 1)); - ICLKernel::configure_internal(win); -} - -void CLRadixSortGlobalScanHistogram::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); - cl::NDRange lws = cl::NDRange(gws_x, 1); - - enqueue(queue, *this, window, lws); -} - -//////////////////////////////////////////////////////////////////////////////// -CLRadixSortPasteHistogram::CLRadixSortPasteHistogram() {} - -void CLRadixSortPasteHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits) -{ - ARM_COMPUTE_ERROR_ON(hist_buf == nullptr && glob_sum_buf == nullptr); - - unsigned int radix = 1 << bits; - // Set kernel build options - std::set<std::string> build_opts; - build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); - build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); - build_opts.emplace("-DPERMUT=1"); - - // Create kernel - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("radixsort_pastehistograms", build_opts)); - - unsigned int idx = 0; - _kernel.setArg(idx++, *hist_buf); - _kernel.setArg(idx++, *glob_sum_buf); - - // Configure kernel window - Window win; - win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1)); - ICLKernel::configure_internal(win); -} - -void CLRadixSortPasteHistogram::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); - cl::NDRange lws = cl::NDRange(gws_x / _HISTOSPLIT, 1); - - enqueue(queue, *this, window, lws); -} - -//////////////////////////////////////////////////////////////////////////////// -CLRadixSortReorder::CLRadixSortReorder() - : _pass(0), 
_in_key_buf(nullptr), _out_key_buf(nullptr), _in_ind_buf(nullptr), - _out_ind_buf(nullptr) -{ -} - -void CLRadixSortReorder::configure(cl::Buffer *hist_buf, int bits, int n) -{ - ARM_COMPUTE_ERROR_ON(hist_buf == nullptr); - ARM_COMPUTE_ERROR_ON(n == 0); - - unsigned int radix = 1 << bits; - // Set kernel build options - std::set<std::string> build_opts; - build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); - build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); - build_opts.emplace("-DPERMUT=1"); - - // Create kernel - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("radixsort_reorder", build_opts)); - - unsigned int idx = 2; - _kernel.setArg(idx++, *hist_buf); - - idx = 6; - _kernel.setArg(idx++, sizeof(uint) * radix * _ITEMS, nullptr); - _kernel.setArg<cl_int>(idx++, n); - - // Configure kernel window - Window win; - win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1)); - ICLKernel::configure_internal(win); -} - -void CLRadixSortReorder::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); - unsigned int lx = std::max(1U, (gws_x / _HISTOSPLIT)); - cl::NDRange lws = (lx < gws_x) ? 
cl::NDRange(lx, 1) : cl::NDRange(1, 1); - - _kernel.setArg(0, *_in_key_buf); - _kernel.setArg(1, *_out_key_buf); - _kernel.setArg<cl_int>(3, _pass); - _kernel.setArg(4, *_in_ind_buf); - _kernel.setArg(5, *_out_ind_buf); - - enqueue(queue, *this, window, lws); -} - -//////////////////////////////////////////////////////////////////////////////// -CLTopKV2FindFirstNegative::CLTopKV2FindFirstNegative() : _out_key_buf(nullptr) {} - -void CLTopKV2FindFirstNegative::configure(cl::Buffer *first_negative_idx_buf, int n) -{ - ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr); - ARM_COMPUTE_ERROR_ON(n == 0); - - // Set kernel build options - std::set<std::string> build_opts; - - // Create kernel - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("topkv2_find_first_negative", build_opts)); - - unsigned int idx = 1; - _kernel.setArg(idx++, *first_negative_idx_buf); - _kernel.setArg<cl_int>(idx++, n); - - // Configure kernel window - Window win; - win.set(0, Window::Dimension(0, n, 1)); - ICLKernel::configure_internal(win); -} - -void CLTopKV2FindFirstNegative::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - unsigned int idx = 0; - _kernel.setArg(idx++, *_out_key_buf); - - enqueue(queue, *this, window); -} - -//////////////////////////////////////////////////////////////////////////////// -CLTopKV2ReorderNegatives::CLTopKV2ReorderNegatives() - : _in_key_buf(nullptr), _out_key_buf(nullptr), _in_ind_buf(nullptr), _out_ind_buf(nullptr) -{ -} - -void CLTopKV2ReorderNegatives::configure(cl::Buffer *first_negative_idx_buf, int n) -{ - ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr); - ARM_COMPUTE_ERROR_ON(n == 0); - - // Set kernel build options - std::set<std::string> build_opts; - - // Create kernel - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("topkv2_reorder_negatives", 
build_opts)); - - unsigned int idx = 4; - _kernel.setArg(idx++, *first_negative_idx_buf); - _kernel.setArg<cl_int>(idx++, n); - - // Configure kernel window - Window win; - win.set(0, Window::Dimension(0, n, 1)); - ICLKernel::configure_internal(win); -} - -void CLTopKV2ReorderNegatives::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - unsigned int idx = 0; - _kernel.setArg(idx++, *_in_key_buf); - _kernel.setArg(idx++, *_out_key_buf); - _kernel.setArg(idx++, *_in_ind_buf); - _kernel.setArg(idx++, *_out_ind_buf); - - enqueue(queue, *this, window); -} - -//////////////////////////////////////////////////////////////////////////////// -CLTopKV2Store::CLTopKV2Store() - : _values(nullptr), _indices(nullptr), _out_key_buf(nullptr), _out_ind_buf(nullptr) -{ -} - -void CLTopKV2Store::configure(ICLTensor *values, ICLTensor *indices, int k, int n) -{ - ARM_COMPUTE_ERROR_ON(values == nullptr && indices == nullptr); - ARM_COMPUTE_ERROR_ON(k == 0); - ARM_COMPUTE_ERROR_ON(k > n); - - _values = values; - _indices = indices; - - // Set kernel build options - std::set<std::string> build_opts; - - // Create kernel - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("topkv2_store", build_opts)); - - unsigned int idx = 2 * num_arguments_per_1D_tensor() + 2; - _kernel.setArg<cl_int>(idx++, n); - - // Configure kernel window - Window win; - win.set(0, Window::Dimension(0, k, 1)); - ICLKernel::configure_internal(win); -} - -void CLTopKV2Store::setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf) -{ - _out_key_buf = out_key_buf; - _out_ind_buf = out_ind_buf; -} - -void CLTopKV2Store::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - unsigned int idx = 0; - add_1D_tensor_argument(idx, _values, window); 
- add_1D_tensor_argument(idx, _indices, window); - _kernel.setArg(idx++, *_out_key_buf); - _kernel.setArg(idx++, *_out_ind_buf); - - enqueue(queue, *this, window); -} - -} // namespace arm_compute diff --git a/libs/ARMComputeEx/src/core/NEON/kernels/NENormalizationLayerExKernel.cpp b/libs/ARMComputeEx/src/core/NEON/kernels/NENormalizationLayerExKernel.cpp deleted file mode 100644 index 3b5782c25..000000000 --- a/libs/ARMComputeEx/src/core/NEON/kernels/NENormalizationLayerExKernel.cpp +++ /dev/null @@ -1,294 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/core/NEON/kernels/NENormalizationLayerExKernel.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/NEON/NEMath.h" - -using namespace arm_compute; - -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *input_squared, - const ITensorInfo *output, const NormalizationLayerInfo &norm_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_squared, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_squared); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, input_squared); - - // Checks performed when output is configured - if (output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); - } - - return Status{}; -} - -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, - ITensorInfo *input_squared, - ITensorInfo *output, - const NormalizationLayerInfo &norm_info) -{ - unsigned int num_elems_processed_per_iteration = 16 / input->element_size(); - const unsigned int num_elems_read_per_iteration = - num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2); - const unsigned int num_rows = - (norm_info.type() == NormType::IN_MAP_2D) ? norm_info.norm_size() : 1; - const unsigned int border_width = - (norm_info.is_cross_map()) ? 
0 : std::min<unsigned int>(norm_info.norm_size() / 2, 3U); - BorderSize border_size = BorderSize(0, border_width); - bool window_changed = false; - - // Configure window - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - - AccessWindowRectangle input_access(input, -border_size.left, 0, num_elems_read_per_iteration, - num_rows); - AccessWindowRectangle input_squared_access(input_squared, -border_size.left, 0, - num_elems_read_per_iteration, num_rows); - - if (output->total_size() != 0) - { - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - window_changed = - update_window_and_padding(win, input_access, input_squared_access, output_access); - output_access.set_valid_region(win, input->valid_region()); - } - else - { - window_changed = update_window_and_padding(win, input_access, input_squared_access); - } - - Status err = (window_changed) - ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; - return std::make_pair(err, win); -} -} // namespace - -NENormalizationLayerExKernel::NENormalizationLayerExKernel() - : _func(nullptr), _input(nullptr), _input_squared(nullptr), _output(nullptr), - _norm_info(NormType::IN_MAP_1D), _border_size() -{ -} - -BorderSize NENormalizationLayerExKernel::border_size() const { return _border_size; } - -void NENormalizationLayerExKernel::configure(const ITensor *input, const ITensor *input_squared, - ITensor *output, NormalizationLayerInfo norm_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_squared, output); - // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*output->info(), *input->info()); - - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(input->info(), input_squared->info(), output->info(), norm_info)); - - const unsigned int border_width = - (norm_info.is_cross_map()) ? 
0 : std::min<unsigned int>(norm_info.norm_size() / 2, 3U); - - _input = input; - _input_squared = input_squared; - _output = output; - _norm_info = norm_info; - _border_size = BorderSize(0, border_width); - - switch (_input->info()->data_type()) - { - case DataType::F32: - { - switch (norm_info.type()) - { - case NormType::IN_MAP_1D: - _func = &NENormalizationLayerExKernel::normalize_float<DataType::F32, 0, false>; - break; - case NormType::IN_MAP_2D: - // Normalize over X and Y - _func = &NENormalizationLayerExKernel::normalize_float<DataType::F32, 0, true>; - break; - case NormType::CROSS_MAP: - _func = &NENormalizationLayerExKernel::normalize_float<DataType::F32, 2, false>; - break; - default: - break; - } - break; - } - case DataType::F16: - { - switch (norm_info.type()) - { - case NormType::IN_MAP_1D: - _func = &NENormalizationLayerExKernel::normalize_float<DataType::F16, 0, false>; - break; - case NormType::IN_MAP_2D: - // Normalize over X and Y - _func = &NENormalizationLayerExKernel::normalize_float<DataType::F16, 0, true>; - break; - case NormType::CROSS_MAP: - _func = &NENormalizationLayerExKernel::normalize_float<DataType::F16, 2, false>; - break; - default: - break; - } - break; - } - default: - ARM_COMPUTE_ERROR("NOT SUPPORTED!"); - } - - // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), input_squared->info(), - output->info(), norm_info); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - INEKernel::configure(win_config.second); -} - -template <DataType dt, unsigned int dim, bool do_2D_norm> -void NENormalizationLayerExKernel::normalize_float(const Window &window) -{ - Iterator input(_input, window); - Iterator input_squared(_input_squared, window); - Iterator output(_output, window); - - const int dim_y = 1; - const int radius = _norm_info.norm_size(); - const int total_size = _input->info()->dimension(dim) - 1; - const int input_squared_stride = _input_squared->info()->strides_in_bytes()[dim]; - // We account 
padding across X only and we iterate over rows - const int min_left = (dim == 2) ? 0 : -static_cast<int>(border_size().left); - const int max_right = (dim == 2) ? total_size : total_size + border_size().left; - const int min_top = 0; - const int max_bottom = _input->info()->dimension(dim_y) - 1; - - if (dt == DataType::F32) - { - const float32x4_t coeff_vec = vdupq_n_f32(_norm_info.scale_coeff()); - const float32x4_t beta_vec = vdupq_n_f32(_norm_info.beta()); - const float32x4_t kappa_vec = vdupq_n_f32(_norm_info.kappa()); - - execute_window_loop( - window, - [&](const Coordinates &id) { - // Get range to normalize - const int current_row = do_2D_norm ? id[dim_y] : 0; - const int current_slice = id[dim]; - const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0; - const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0; - const int first_slice = std::max(current_slice - radius, min_left); - const int last_slice = std::min(current_slice + radius, max_right); - - // Accumulate 2D In-Map values - float32x4_t accu = vdupq_n_f32(0.f); - for (int j = first_row; j <= last_row; j++) - { - // Compute row displacement - const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y]; - const uint8_t *const input_squared_ptr = - input_squared.ptr() + row - (current_slice * input_squared_stride); - for (int i = first_slice; i <= last_slice; ++i) - { - accu = vaddq_f32(accu, vld1q_f32(reinterpret_cast<const float *>( - input_squared_ptr + i * input_squared_stride))); - } - } - - // Normalize - const float32x4_t normalized = vpowq_f32(vmlaq_f32(kappa_vec, coeff_vec, accu), beta_vec); - const float32x4_t normalized_pixel = vmulq_f32( - vld1q_f32(reinterpret_cast<const float *>(input.ptr())), vinvq_f32(normalized)); - vst1q_f32(reinterpret_cast<float *>(output.ptr()), normalized_pixel); - }, - input, input_squared, output); - } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - else if (dt == DataType::F16) - { - 
const float16x8_t coeff_vec = vdupq_n_f16(_norm_info.scale_coeff()); - const float16x8_t beta_vec_f16 = vdupq_n_f16(_norm_info.beta()); - const float16x8_t kappa_vec = vdupq_n_f16(_norm_info.kappa()); - - execute_window_loop( - window, - [&](const Coordinates &id) { - // Get range to normalize - const int current_row = do_2D_norm ? id[dim_y] : 0; - const int current_slice = id[dim]; - const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0; - const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0; - const int first_slice = std::max(current_slice - radius, min_left); - const int last_slice = std::min(current_slice + radius, max_right); - - // Accumulate 2D In-Map values - float16x8_t accu = vdupq_n_f16(0.f); - for (int j = first_row; j <= last_row; j++) - { - // Compute row displacement - const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y]; - const uint8_t *const input_squared_ptr = - input_squared.ptr() + row - (current_slice * input_squared_stride); - for (int i = first_slice; i <= last_slice; ++i) - { - accu = vaddq_f16(accu, vld1q_f16(reinterpret_cast<const float16_t *>( - input_squared_ptr + i * input_squared_stride))); - } - } - - const float16x8_t norm_f16 = - vpowq_f16(vaddq_f16(kappa_vec, vmulq_f16(coeff_vec, accu)), beta_vec_f16); - const float16x8_t normalized_pixel = vmulq_f16( - vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr())), vinvq_f16(norm_f16)); - vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), normalized_pixel); - }, - input, input_squared, output); - } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - else - { - ARM_COMPUTE_ERROR("Not supported"); - } -} - -Status NENormalizationLayerExKernel::validate(const ITensorInfo *input, - const ITensorInfo *input_squared, - const ITensorInfo *output, - const NormalizationLayerInfo norm_info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, input_squared, output, norm_info)); - 
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), - input_squared->clone().get(), - output->clone().get(), norm_info) - .first); - - return Status{}; -} - -void NENormalizationLayerExKernel::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_func == nullptr); - - // Run function - (this->*_func)(window); -} diff --git a/libs/ARMComputeEx/src/core/UtilsEx.cpp b/libs/ARMComputeEx/src/core/UtilsEx.cpp deleted file mode 100644 index b63093bbb..000000000 --- a/libs/ARMComputeEx/src/core/UtilsEx.cpp +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/core/UtilsEx.h" - -#include <cstdint> -#include <fstream> -#include <map> -#include <string> - -using namespace arm_compute; - -const std::string & -arm_compute::string_from_activation_func_ex(ActivationLayerInfoEx::ActivationFunction act) -{ - static std::map<ActivationLayerInfoEx::ActivationFunction, const std::string> act_map = { - {ActivationLayerInfoEx::ActivationFunction::RSQRT, "RSQRT"}, - }; - - return act_map[act]; -} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLActivationLayerEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLActivationLayerEx.cpp deleted file mode 100644 index 1e52fc429..000000000 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLActivationLayerEx.cpp +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/runtime/CL/functions/CLActivationLayerEx.h" - -#include "arm_compute/core/CL/kernels/CLActivationLayerExKernel.h" - -using namespace arm_compute; - -void CLActivationLayerEx::configure(ICLTensor *input, ICLTensor *output, - ActivationLayerInfoEx act_info) -{ - auto k = arm_compute::support::cpp14::make_unique<CLActivationLayerExKernel>(); - k->configure(input, output, act_info); - _kernel = std::move(k); -} - -Status CLActivationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, - const ActivationLayerInfoEx &act_info) -{ - return CLActivationLayerExKernel::validate(input, output, act_info); -} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLArgMinMax.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLArgMinMax.cpp deleted file mode 100644 index dff743e89..000000000 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLArgMinMax.cpp +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/runtime/CL/functions/CLArgMinMax.h" - -#include "arm_compute/core/CL/kernels/CLArgMinMaxKernel.h" -#include "arm_compute/runtime/CL/CLScheduler.h" - -namespace arm_compute -{ - -CLArgMinMax::CLArgMinMax() - : _input(nullptr), _output(nullptr), _argminmax_axis(), _interm_tensors(), _argminmax_kernels(), - _num_of_kernels() -{ -} - -void CLArgMinMax::configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis, - ArgOperation op) -{ - ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), axis, op)); - _input = input; - _output = output; - _argminmax_axis = axis; - _arg_op = op; - // NOTE The argminmax_axis must have no duplication. - _num_of_kernels = axis.size(); - const size_t num_of_interm_tensors = _num_of_kernels - 1; - - _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors); - _argminmax_kernels = - arm_compute::support::cpp14::make_unique<CLArgMinMaxKernel[]>(_num_of_kernels); - - TensorShape shape{input->info()->tensor_shape()}; - for (size_t i = 0; i < num_of_interm_tensors; i++) - { - shape.set(_argminmax_axis[i], 1); - _interm_tensors[i].allocator()->init( - TensorInfo(shape, input->info()->num_channels(), input->info()->data_type())); - _interm_tensors[i].allocator()->allocate(); - } - - // Set a vector that is ordered ICLTensors sequentially. 
- std::vector<ICLTensor *> tensors; - tensors.emplace_back(input); - for (size_t i = 0; i < num_of_interm_tensors; i++) - { - tensors.emplace_back(_interm_tensors.get() + i); - } - tensors.emplace_back(output); - - // Apply ArgMinMax on all kernels - for (size_t i = 0; i < _num_of_kernels; i++) - { - _argminmax_kernels[i].configure(tensors[i], tensors[i + 1], _argminmax_axis[i], op); - } -} - -Status CLArgMinMax::validate(const ITensorInfo *input, const std::vector<uint32_t> &argminmax_axis, - const ITensorInfo *output, ArgOperation op) -{ - const size_t num_of_kernels = argminmax_axis.size(); - const size_t num_of_interm_tensors = num_of_kernels - 1; - - // Create temporary tensor infos - auto interm_tensors = - arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors); - - // Create intermediate tensor info - TensorShape shape{input->tensor_shape()}; - - for (size_t i = 0; i < num_of_interm_tensors; i++) - { - shape.set(argminmax_axis[i], 1); - interm_tensors[i].set_data_type(input->data_type()); - interm_tensors[i].set_tensor_shape(shape); - interm_tensors[i].set_num_channels(input->num_channels()); - } - - // Set a vector that is ordered ITensorInfo sequentially. 
- std::vector<const ITensorInfo *> tensors; - tensors.emplace_back(input); - for (size_t i = 0; i < num_of_interm_tensors; i++) - { - tensors.emplace_back(interm_tensors.get() + i); - } - tensors.emplace_back(output); - - // Validate argminmax only on all kernels - for (size_t i = 0; i < num_of_kernels; i++) - { - ARM_COMPUTE_RETURN_ON_ERROR( - CLArgMinMaxKernel::validate(tensors[i], tensors[i + 1], argminmax_axis[i], op)); - } - - return Status{}; -} - -void CLArgMinMax::run() -{ - for (size_t i = 0; i < _num_of_kernels; ++i) - { - CLScheduler::get().enqueue(_argminmax_kernels[i]); - } -} - -} // namespace arm_compute diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLArithmeticSubtractionEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLArithmeticSubtractionEx.cpp deleted file mode 100644 index 3f403c80a..000000000 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLArithmeticSubtractionEx.cpp +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/runtime/CL/functions/CLArithmeticSubtractionEx.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionExKernel.h" - -using namespace arm_compute; - -void CLArithmeticSubtractionEx::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, - ConvertPolicy policy) -{ - auto k = arm_compute::support::cpp14::make_unique<CLArithmeticSubtractionExKernel>(); - k->configure(input1, input2, output, policy); - _kernel = std::move(k); - - if (output->info()->dimension(0) > 1) - { - ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2; - - if (broadcasted_info->info()->dimension(0) == 1) - { - _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); - } - } -} - -Status CLArithmeticSubtractionEx::validate(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *output, ConvertPolicy policy) -{ - return CLArithmeticSubtractionExKernel::validate(input1, input2, output, policy); -} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLBatchToSpaceND.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLBatchToSpaceND.cpp deleted file mode 100644 index 26e3798cc..000000000 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLBatchToSpaceND.cpp +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/runtime/CL/functions/CLBatchToSpaceND.h" - -#include "arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h" - -using namespace arm_compute; - -void CLBatchToSpaceND::configure(ICLTensor *input, ICLTensor *output, const int32_t *block_size) -{ - auto k = arm_compute::support::cpp14::make_unique<CLBatchToSpaceNDKernel>(); - k->configure(input, output, block_size); - _kernel = std::move(k); -} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp deleted file mode 100644 index 7c5fe5eda..000000000 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h" - -#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, - BinaryLogicalOperation op) -{ - auto k = arm_compute::support::cpp14::make_unique<CLBinaryLogicalOpKernel>(); - k->configure(input1, input2, output, op); - _kernel = std::move(k); - - if (output->info()->dimension(0) > 1) - { - ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2; - if (broadcasted_info->info()->dimension(0) == 1) - { - _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); - } - } -} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp deleted file mode 100644 index 8e106737c..000000000 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/runtime/CL/functions/CLCast.h" - -#include "arm_compute/core/CL/kernels/CLCastKernel.h" - -using namespace arm_compute; - -void CLCast::configure(ICLTensor *input, ICLTensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<CLCastKernel>(); - k->configure(input, output); - _kernel = std::move(k); -} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLComparisonOp.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLComparisonOp.cpp deleted file mode 100644 index f6a745a25..000000000 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLComparisonOp.cpp +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/runtime/CL/functions/CLComparisonOp.h" - -#include "arm_compute/core/CL/kernels/CLComparisonOpKernel.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -void CLComparisonOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, - const ComparisonOperation &op) -{ - auto k = arm_compute::support::cpp14::make_unique<CLComparisonOpKernel>(); - k->configure(input1, input2, output, op); - _kernel = std::move(k); - - if (output->info()->dimension(0) > 1) - { - ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? 
input1 : input2; - - if (broadcasted_info->info()->dimension(0) == 1) - { - _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); - } - } -} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp deleted file mode 100644 index c2e4ca9ff..000000000 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/runtime/CL/functions/CLDepthToSpace.h" - -#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h" - -using namespace arm_compute; - -void CLDepthToSpace::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size) -{ - auto k = arm_compute::support::cpp14::make_unique<CLDepthToSpaceKernel>(); - k->configure(input, output, block_size); - _kernel = std::move(k); -} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp deleted file mode 100644 index 2781784ca..000000000 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/runtime/CL/functions/CLEmbeddingLookup.h" - -#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h" - -using namespace arm_compute; - -void CLEmbeddingLookup::configure(const ICLTensor *input, ICLTensor *output, - const ICLTensor *lookups) -{ - auto k = arm_compute::support::cpp14::make_unique<CLEmbeddingLookupKernel>(); - k->configure(input, output, lookups); - _kernel = std::move(k); -} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLExp.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLExp.cpp deleted file mode 100644 index 411fa8700..000000000 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLExp.cpp +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/runtime/CL/functions/CLExp.h" - -#include "arm_compute/core/CL/kernels/CLExpKernel.h" - -using namespace arm_compute; - -void CLExp::configure(const ICLTensor *input, ICLTensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<CLExpKernel>(); - k->configure(input, output); - _kernel = std::move(k); -} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp deleted file mode 100644 index fb056fe45..000000000 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/runtime/CL/functions/CLGather.h" - -#include "arm_compute/core/CL/kernels/CLGatherKernel.h" - -using namespace arm_compute; - -void CLGather::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<CLGatherKernel>(); - k->configure(input1, input2, output); - _kernel = std::move(k); -} - -Status CLGather::validate(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *output) -{ - return CLGatherKernel::validate(input1, input2, output); -} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp deleted file mode 100644 index 7180e9356..000000000 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/runtime/CL/functions/CLHashtableLookup.h" - -#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h" - -using namespace arm_compute; - -void CLHashtableLookup::configure(const ICLTensor *lookups, const ICLTensor *keys, - const ICLTensor *input, ICLTensor *output, ICLTensor *hits) -{ - auto k = arm_compute::support::cpp14::make_unique<CLHashtableLookupKernel>(); - k->configure(lookups, keys, input, output, hits); - _kernel = std::move(k); -} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp deleted file mode 100644 index be35ea732..000000000 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/runtime/CL/functions/CLNeg.h" - -#include "arm_compute/core/CL/kernels/CLNegKernel.h" - -using namespace arm_compute; - -void CLNeg::configure(ICLTensor *input, ICLTensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<CLNegKernel>(); - k->configure(input, output); - _kernel = std::move(k); -} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLNormalizationLayerEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLNormalizationLayerEx.cpp deleted file mode 100644 index 276c4557a..000000000 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLNormalizationLayerEx.cpp +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/runtime/CL/functions/CLNormalizationLayerEx.h" - -#include "arm_compute/runtime/CL/CLScheduler.h" - -using namespace arm_compute; - -CLNormalizationLayerEx::CLNormalizationLayerEx() : _norm_kernel(), _border_handler() {} - -void CLNormalizationLayerEx::configure(ICLTensor *input, ICLTensor *output, - const NormalizationLayerInfo &norm_info) -{ - ARM_COMPUTE_ERROR_ON(input == nullptr); - - // Configure normalization kernel - _norm_kernel.configure(input, output, norm_info); - - // Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel - _border_handler.configure(input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0)); -} - -Status CLNormalizationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, - const NormalizationLayerInfo &norm_info) -{ - return CLNormalizationLayerExKernel::validate(input, output, norm_info); -} - -void CLNormalizationLayerEx::run() -{ - // Run border handler - CLScheduler::get().enqueue(_border_handler, false); - - // Run normalization kernel - CLScheduler::get().enqueue(_norm_kernel); -} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp deleted file mode 100644 index 38adedd10..000000000 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/runtime/CL/functions/CLPReLU.h" - -#include "arm_compute/core/CL/kernels/CLPReLUKernel.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -void CLPReLU::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<CLPReLUKernel>(); - k->configure(input, alpha, output); - _kernel = std::move(k); - - if (output->info()->dimension(0) > 1) - { - ICLTensor *broadcasted_info = (input->info()->dimension(0) == 1) ? input : alpha; - - if (broadcasted_info->info()->dimension(0) == 1) - { - _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); - } - } -} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp deleted file mode 100644 index 5265b6c34..000000000 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp +++ /dev/null @@ -1,28 +0,0 @@ -/* -* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved -* Copyright (c) 2016-2018 ARM Limited. -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*/ -#include "arm_compute/runtime/CL/functions/CLPadLayerEx.h" - -#include "arm_compute/core/CL/kernels/CLPadLayerKernel.h" - -using namespace arm_compute; - -void CLPadLayerEx::configure(ICLTensor *input, ICLTensor *output, ICLTensor *pad_size) -{ - auto k = arm_compute::support::cpp14::make_unique<CLPadLayerKernel>(); - k->configure(input, output, pad_size); - _kernel = std::move(k); -} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLPermuteEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLPermuteEx.cpp deleted file mode 100644 index fb363270d..000000000 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLPermuteEx.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/runtime/CL/functions/CLPermuteEx.h" - -#include "arm_compute/core/CL/kernels/CLPermuteExKernel.h" - -using namespace arm_compute; - -void CLPermuteEx::configure(const ICLTensor *input, ICLTensor *output, - const PermutationVector &perm) -{ - auto k = arm_compute::support::cpp14::make_unique<CLPermuteExKernel>(); - k->configure(input, output, perm); - _kernel = std::move(k); -} - -Status CLPermuteEx::validate(const ITensorInfo *input, const ITensorInfo *output, - const PermutationVector &perm) -{ - ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteExKernel::validate(input, output, perm)); - return Status{}; -} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp deleted file mode 100644 index dc0baa8dd..000000000 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/runtime/CL/functions/CLPixelWiseDivision.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h" - -using namespace arm_compute; - -void CLPixelWiseDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, - float scale, ConvertPolicy overflow_policy, - RoundingPolicy rounding_policy) -{ - auto k = arm_compute::support::cpp14::make_unique<CLPixelWiseDivisionKernel>(); - k->configure(input1, input2, output, scale, overflow_policy, rounding_policy); - _kernel = std::move(k); - - if (output->info()->dimension(0) > 1) - { - ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2; - - if (broadcasted_info->info()->dimension(0) == 1) - { - _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); - } - } -} - -Status CLPixelWiseDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *output, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy) -{ - return CLPixelWiseDivisionKernel::validate(input1, input2, output, scale, overflow_policy, - rounding_policy); -} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp deleted file mode 100644 index 2b8d82706..000000000 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/runtime/CL/functions/CLReduceOperation.h" - -#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h" -#include "arm_compute/runtime/CL/CLScheduler.h" - -using namespace arm_compute; - -CLReduceOperation::CLReduceOperation() - : _input(nullptr), _output(nullptr), _axis(), _interm_tensors(), _reduce_kernels() -{ -} - -Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo *output, - const std::set<uint32_t> &axis, const ReduceOperation &op) -{ - const size_t num_of_kernels = axis.size(); - const size_t num_of_interm_tensors = num_of_kernels - 1; - - // Create temporary tensor infos - auto interm_tensors = - arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors); - - // Create intermediate tensor info - TensorShape shape{input->tensor_shape()}; - - auto it = axis.begin(); - for (size_t i = 0; i < num_of_interm_tensors; ++i, ++it) - { - shape.set(*it, 1); - interm_tensors[i].set_data_type(input->data_type()); - interm_tensors[i].set_tensor_shape(shape); - interm_tensors[i].set_num_channels(input->num_channels()); - } - - // Set a vector that is ordered ITensorInfo sequentially. 
- std::vector<const ITensorInfo *> tensors; - tensors.emplace_back(input); - for (size_t i = 0; i < num_of_interm_tensors; ++i) - { - tensors.emplace_back(interm_tensors.get() + i); - } - tensors.emplace_back(output); - - // Validate ReduceOperation only on all kernels - it = axis.begin(); - for (size_t i = 0; i < num_of_kernels; ++i, ++it) - { - ARM_COMPUTE_RETURN_ON_ERROR( - CLReduceOperationKernel::validate(tensors[i], tensors[i + 1], *it, op)); - } - - return Status{}; -} - -void CLReduceOperation::configure(ICLTensor *input, ICLTensor *output, - const std::set<uint32_t> &axis, ReduceOperation op) -{ - ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), axis, op)); - - _axis = axis; - - _input = input; - _output = output; - - // NOTE The axis must have no duplication. - const size_t num_of_kernels = axis.size(); - const size_t num_of_interm_tensors = num_of_kernels - 1; - - _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors); - _reduce_kernels = - arm_compute::support::cpp14::make_unique<CLReduceOperationKernel[]>(num_of_kernels); - - TensorShape shape{input->info()->tensor_shape()}; - auto it = axis.begin(); - for (size_t i = 0; i < num_of_interm_tensors; ++i, ++it) - { - shape.set(*it, 1); - _interm_tensors[i].allocator()->init( - TensorInfo(shape, input->info()->num_channels(), input->info()->data_type())); - _interm_tensors[i].allocator()->allocate(); - } - - // Set a vector that is ordered ICLTensors sequentially. 
- std::vector<ICLTensor *> tensors; - tensors.emplace_back(input); - for (size_t i = 0; i < num_of_interm_tensors; ++i) - { - tensors.emplace_back(_interm_tensors.get() + i); - } - tensors.emplace_back(output); - - // Apply ReduceOperation on all kernels - it = axis.begin(); - for (size_t i = 0; i < num_of_kernels; ++i, ++it) - { - _reduce_kernels[i].configure(tensors[i], tensors[i + 1], *it, op); - } -} - -void CLReduceOperation::run() -{ - const size_t num_of_kernels = _axis.size(); - for (size_t i = 0; i < num_of_kernels; ++i) - { - CLScheduler::get().enqueue(_reduce_kernels[i]); - } -} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp deleted file mode 100644 index c03826891..000000000 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/runtime/CL/functions/CLSpaceToBatchND.h" - -#include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h" - -using namespace arm_compute; - -void CLSpaceToBatchND::configure(const ICLTensor *input, const ICLTensor *block_size, - const ICLTensor *padding_size, ICLTensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<CLSpaceToBatchNDKernel>(); - k->configure(input, block_size, padding_size, output); - _kernel = std::move(k); -} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp deleted file mode 100644 index 0f455f96f..000000000 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/runtime/CL/functions/CLSpaceToDepth.h" - -#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h" - -using namespace arm_compute; - -void CLSpaceToDepth::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size) -{ - auto k = arm_compute::support::cpp14::make_unique<CLSpaceToDepthKernel>(); - k->configure(input, output, block_size); - _kernel = std::move(k); -} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLSquaredDifference.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLSquaredDifference.cpp deleted file mode 100644 index dc6e4af44..000000000 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLSquaredDifference.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/runtime/CL/functions/CLSquaredDifference.h" - -#include "arm_compute/core/CL/kernels/CLSquaredDifferenceKernel.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -void CLSquaredDifference::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<CLSquaredDifferenceKernel>(); - k->configure(input1, input2, output); - _kernel = std::move(k); - - if (output->info()->dimension(0) > 1) - { - ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? 
input1 : input2; - - if (broadcasted_info->info()->dimension(0) == 1) - { - _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); - } - } -} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSliceEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSliceEx.cpp deleted file mode 100644 index be7353493..000000000 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSliceEx.cpp +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/runtime/CL/functions/CLStridedSliceEx.h" - -#include "arm_compute/core/CL/kernels/CLStridedSliceExKernel.h" - -using namespace arm_compute; - -void CLStridedSliceEx::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData, - ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, - int32_t endMask, int32_t shrinkAxisMask) -{ - auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceExKernel>(); - k->configure(input, output, beginData, endData, stridesData, beginMask, endMask, shrinkAxisMask); - _kernel = std::move(k); -} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp deleted file mode 100644 index 19177497c..000000000 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp +++ /dev/null @@ -1,302 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/runtime/CL/functions/CLTopKV2.h" -#include "arm_compute/runtime/CL/CLScheduler.h" - -#include "arm_compute/core/CL/ICLTensor.h" - -#include "../../topk_v2.h" - -namespace arm_compute -{ - -CLTopKV2::CLTopKV2() - : _k(0), _total_bits(0), _bits(0), _radix(0), _hist_buf_size(0), _glob_sum_buf_size(0), _n(0), - _input(nullptr), _values(nullptr), _indices(nullptr), _qs_idx_buf(), _qs_temp_buf(), - _hist_buf(), _glob_sum_buf(), _temp_buf(), _first_negative_idx_buf(), _in_key_buf(), - _out_key_buf(), _in_ind_buf(), _out_ind_buf(), _p_in_key_buf(nullptr), - _p_out_key_buf(nullptr), _p_in_ind_buf(nullptr), _p_out_ind_buf(nullptr), _qs_kernel(), - _init_kernel(), _hist_kernel(), _scan_hist_kernel(), _glob_scan_hist_kernel(), - _paste_hist_kernel(), _reorder_kernel(), _find_first_negative_kernel(), - _reorder_negatives_kernel(), _store_kernel() -{ -} - -void CLTopKV2::configure(ICLTensor *input, int k, ICLTensor *values, ICLTensor *indices, - int total_bits, int bits) -{ - _total_bits = total_bits; - _bits = bits; - _n = input->info()->tensor_shape()[0]; - - // _total_bits should be divided by _bits. 
- ARM_COMPUTE_ERROR_ON((_total_bits % _bits) != 0); - - _k = k; - _radix = 1 << bits; - - _input = input; - _values = values; - _indices = indices; - - std::string topk_env; - - char *env = getenv("ACL_TOPKV2"); - if (env) - topk_env = env; - - if (topk_env == "GPU_SINGLE") - { - _qs_idx_buf = cl::Buffer(CLScheduler::get().context(), - CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n); - _qs_temp_buf = cl::Buffer(CLScheduler::get().context(), - CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n); - - _qs_kernel.configure(input, values, indices, &_qs_idx_buf, &_qs_temp_buf, k, _n); - } - else if (topk_env == "GPU") - { - // n should be divided by (_GROUPS * _ITEMS) - ARM_COMPUTE_ERROR_ON((_n % (_GROUPS * _ITEMS)) != 0); - - _hist_buf_size = _radix * _GROUPS * _ITEMS; - _glob_sum_buf_size = _HISTOSPLIT; - - _hist_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, - sizeof(cl_int) * _hist_buf_size); - _glob_sum_buf = - cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, - sizeof(cl_int) * _glob_sum_buf_size); - _temp_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, - sizeof(cl_int) * _glob_sum_buf_size); - _first_negative_idx_buf = cl::Buffer(CLScheduler::get().context(), - CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int)); - _in_key_buf = cl::Buffer(CLScheduler::get().context(), - CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_float) * _n); - _out_key_buf = cl::Buffer(CLScheduler::get().context(), - CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_float) * _n); - _in_ind_buf = cl::Buffer(CLScheduler::get().context(), - CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n); - _out_ind_buf = cl::Buffer(CLScheduler::get().context(), - CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n); - - _p_in_key_buf = &_in_key_buf; - _p_out_key_buf = &_out_key_buf; - _p_in_ind_buf = &_in_ind_buf; - 
_p_out_ind_buf = &_out_ind_buf; - - _init_kernel.configure(input, _p_in_key_buf, _p_in_ind_buf, _n); - _hist_kernel.configure(&_hist_buf, bits, _n); - _scan_hist_kernel.configure(&_hist_buf, &_glob_sum_buf, bits); - _glob_scan_hist_kernel.configure(&_glob_sum_buf, &_temp_buf, bits); - _paste_hist_kernel.configure(&_hist_buf, &_glob_sum_buf, bits); - _reorder_kernel.configure(&_hist_buf, bits, _n); - _find_first_negative_kernel.configure(&_first_negative_idx_buf, _n); - _reorder_negatives_kernel.configure(&_first_negative_idx_buf, _n); - _store_kernel.configure(values, indices, k, _n); - } - else - { - // DO NOTHING for CPU. - } -} - -void CLTopKV2::run() -{ - std::string topk_env; - - char *env = getenv("ACL_TOPKV2"); - if (env) - topk_env = env; - - if (topk_env == "GPU_SINGLE") - { - run_on_gpu_single_quicksort(); - } - else if (topk_env == "GPU") - { - run_on_gpu(); - } - else - { - run_on_cpu(); - } -} - -void CLTopKV2::run_on_gpu_single_quicksort() -{ - // This is a single threaded quick sort implementation. - CLScheduler::get().enqueue(_qs_kernel, false); - - arm_compute::CLScheduler::get().sync(); -} - -void CLTopKV2::run_on_gpu() -{ - cl::CommandQueue q = CLScheduler::get().queue(); - - // 1. CLTopKV2Init set key buffer and index buffer. - // - Key buffer is set as the same value of the layer's input - // - Values in the index buffer are set as their indices. - CLScheduler::get().enqueue(_init_kernel, false); - - int n_passes = _total_bits / _bits; - - // 2. Repeat (total_bits/bits) times. - // - total_bits is the number of bits of the data type (e.g., 32 for float) - // - bits defines number of buckets (e.g. 16 buckets where bit is 4) - for (int pass = 0; pass < n_passes; ++pass) - { - arm_compute::CLScheduler::get().sync(); - - // 2.1. Calculate histogram with _GROUPS * _ITEMS threads - _hist_kernel.setPass(pass, _p_in_key_buf); - CLScheduler::get().enqueue(_hist_kernel, false); - - // 2.2. 
Calculate prefix sum locally with multiple threads - CLScheduler::get().enqueue(_scan_hist_kernel, false); - // 2.3. Calculate prefix sum within a work group - CLScheduler::get().enqueue(_glob_scan_hist_kernel, false); - // 2.4. Calculate global prefix sum - CLScheduler::get().enqueue(_paste_hist_kernel, false); - - // 2.5. Reorder keys and indices based on the global prefix sum - _reorder_kernel.setPass(pass, _p_in_key_buf, _p_out_key_buf, _p_in_ind_buf, _p_out_ind_buf); - CLScheduler::get().enqueue(_reorder_kernel, false); - - cl::Buffer *tmp; - // swap key buffers - tmp = _p_in_key_buf; - _p_in_key_buf = _p_out_key_buf; - _p_out_key_buf = tmp; - - // swap index buffers - tmp = _p_in_ind_buf; - _p_in_ind_buf = _p_out_ind_buf; - _p_out_ind_buf = tmp; - } - - // 3. Get the first negative index - // Because we swap in_buf and out_buf at the end of the above for loop, - // the output buffers are in bufs. - _find_first_negative_kernel.setOutputBuffer(_p_in_key_buf); - CLScheduler::get().enqueue(_find_first_negative_kernel, false); - - // 4. Correct odering of negatives - // - Since radix sort does not consider negatives, negatives are considered as bigger values - // than positives. - // reordered data will be stored in _p_out_key_buf and _p_out_ind_buf - _reorder_negatives_kernel.setBuffers(_p_in_key_buf, _p_out_key_buf, _p_in_ind_buf, - _p_out_ind_buf); - CLScheduler::get().enqueue(_reorder_negatives_kernel, false); - - // 5. Extract top k values from sorted keys and indices. - _store_kernel.setOutputBuffers(_p_out_key_buf, _p_out_ind_buf); - CLScheduler::get().enqueue(_store_kernel, false); - - arm_compute::CLScheduler::get().sync(); - -#if 0 - // below code is left for debugging. 
- int first_neg; - q.enqueueReadBuffer(_first_negative_idx_buf, CL_TRUE, 0, sizeof(cl_int), &first_neg); - std::cout << "first neg = " << first_neg << std::endl; - - float in_key[_n]; - q.enqueueReadBuffer(*_p_in_key_buf, CL_TRUE, 0, sizeof(cl_float)*_n, in_key); - for(uint32_t i = 0 ; i < _n; ++i) { - std::cout << "in_key[" << i << "] = " << in_key[i] << std::endl; - } - - float out_key[_n]; - q.enqueueReadBuffer(*_p_out_key_buf, CL_TRUE, 0, sizeof(cl_float)*_n, out_key); - for(uint32_t i = 0 ; i < _n; ++i) { - std::cout << "out_key[" << i << "] = " << out_key[i] << std::endl; - } - - int in_ind[_n]; - q.enqueueReadBuffer(*_p_in_ind_buf, CL_TRUE, 0, sizeof(cl_int)*_n, in_ind); - for(uint32_t i = 0 ; i < _n; ++i) { - std::cout << "in_ind[" << i << "] = " << in_ind[i] << std::endl; - } - - int out_ind[_n]; - q.enqueueReadBuffer(*_p_out_ind_buf, CL_TRUE, 0, sizeof(cl_int)*_n, out_ind); - for(uint32_t i = 0 ; i < _n; ++i) { - std::cout << "out_ind[" << i << "] = " << out_ind[i] << std::endl; - } - - int hist_buf[_hist_buf_size]; - q.enqueueReadBuffer(_hist_buf, CL_TRUE, 0, sizeof(cl_int)*_hist_buf_size, hist_buf); - for(uint32_t i = 0 ; i < _hist_buf_size; ++i) { - std::cout << "hist_buf[" << i << "] = " << hist_buf[i] << std::endl; - } - - int glob_sum_buf[_glob_sum_buf_size]; - q.enqueueReadBuffer(_glob_sum_buf, CL_TRUE, 0, sizeof(cl_int)*_glob_sum_buf_size, glob_sum_buf); - for(uint32_t i = 0 ; i < _glob_sum_buf_size; ++i) { - std::cout << "glob_sum_buf[" << i << "] = " << glob_sum_buf[i] << std::endl; - } - -#endif -} - -void CLTopKV2::run_on_cpu() -{ - cl::CommandQueue q = CLScheduler::get().queue(); - // const Window& w = _topkv2_kernel.window(); - - _input->map(q); - _values->map(q); - _indices->map(q); - - // int row_size = (w[0].end() - w[0].start()) / w[0].step(); - int row_size = _input->info()->tensor_shape()[0]; - int rank = _input->info()->num_dimensions(); - - if (rank > 2) - throw std::runtime_error("Not supported type."); - - int row_num = (rank == 2 
? _input->info()->tensor_shape()[1] : 1); - - if (_input->info()->data_type() == DataType::F32) - { - nnfw::rt::optimized_ops::TopK<float>(row_size, row_num, (float *)_input->buffer(), _k, - (int32 *)_indices->buffer(), (float *)_values->buffer()); - } - else if (_input->info()->data_type() == DataType::S32) - { - nnfw::rt::optimized_ops::TopK<int32_t>(row_size, row_num, (int32_t *)_input->buffer(), _k, - (int32 *)_indices->buffer(), - (int32_t *)_values->buffer()); - } - else if (_input->info()->data_type() == DataType::QASYMM8) - { - nnfw::rt::optimized_ops::TopK<uint8_t>(row_size, row_num, (uint8_t *)_input->buffer(), _k, - (int32 *)_indices->buffer(), - (uint8_t *)_values->buffer()); - } - else - { - throw std::runtime_error("Not supported type."); - } - - _input->unmap(q); - _values->unmap(q); - _indices->unmap(q); -} -} // namespace arm_compute diff --git a/libs/ARMComputeEx/src/runtime/NEON/functions/NENormalizationLayerEx.cpp b/libs/ARMComputeEx/src/runtime/NEON/functions/NENormalizationLayerEx.cpp deleted file mode 100644 index 988e92715..000000000 --- a/libs/ARMComputeEx/src/runtime/NEON/functions/NENormalizationLayerEx.cpp +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/runtime/NEON/functions/NENormalizationLayerEx.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - -using namespace arm_compute; - -NENormalizationLayerEx::NENormalizationLayerEx(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _norm_kernel(), _multiply_kernel(), - _border_handler(), _input_squared() -{ -} - -void NENormalizationLayerEx::configure(const ITensor *input, ITensor *output, - const NormalizationLayerInfo &norm_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - TensorInfo tensor_info(input->info()->tensor_shape(), 1, input->info()->data_type(), - input->info()->quantization_info()); - _input_squared.allocator()->init(tensor_info); - - // Manage intermediate buffers - _memory_group.manage(&_input_squared); - - // Configure kernels - _norm_kernel.configure(input, &_input_squared, output, norm_info); - _multiply_kernel.configure(input, input, &_input_squared, 1.0f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO); - _border_handler.configure(&_input_squared, _norm_kernel.border_size(), BorderMode::CONSTANT, - PixelValue(0.0f)); - - // Allocate the tensor once the configure methods have been called - _input_squared.allocator()->allocate(); -} - -Status NENormalizationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, - const NormalizationLayerInfo &norm_info) -{ - // Perform validation step - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - - ARM_COMPUTE_RETURN_ON_ERROR( - NENormalizationLayerExKernel::validate(input, input, output, norm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate( - input, input, output, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - - return Status{}; -} - -void NENormalizationLayerEx::run() -{ - _memory_group.acquire(); - - NEScheduler::get().schedule(&_multiply_kernel, Window::DimY); - NEScheduler::get().schedule(&_border_handler, Window::DimY); - 
NEScheduler::get().schedule(&_norm_kernel, Window::DimY); - - _memory_group.release(); -} diff --git a/libs/ARMComputeEx/src/runtime/topk_v2.h b/libs/ARMComputeEx/src/runtime/topk_v2.h deleted file mode 100644 index f94effea1..000000000 --- a/libs/ARMComputeEx/src/runtime/topk_v2.h +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2018 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * @file topk_v2.h - * @brief This file contains TopK method and TopContainer class for TopK operation - * @ingroup COM_AI_RUNTIME - */ - -#ifndef __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__ -#define __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__ - -typedef int32_t int32; - -namespace nnfw -{ -namespace rt -{ -namespace optimized_ops -{ -/** - * @brief class to define TopK operation - * @note The follwing codes are impemented and modified while referring to TFLite topk_v2.cc file. - * TopK_v2 of NN Runtime supports TENSOR_FLOAT32, TENSOR_QUANT8_ASYMM, TENSOR_INT32 other than - * TFLite. - * (TFLite additionaly supports kTfLiteInt64.) - * - * The class that collects top indexes of k values. Based on template - * tensorflow::gtl::TopN<> but, for optimization, - * it re-uses the same container. 
- */ -template <typename T> class TopContainer -{ -public: - /** - * @brief Prevent default constructor of of this class - */ - TopContainer() = delete; - /** - * @brief Constructor with params - * @param [in] row_size Size of row in data - * @param [in] k The top k predictions - */ - TopContainer(int32 k, int32 row_size) : k_(k), container_(), values_(nullptr) - { - container_.reserve(std::min(k, row_size) + 1); - } - - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers) - * @param [in] topContainer To copy - */ - TopContainer(const TopContainer &) = delete; - /* - * @brief Prevent instances of this class from being copied (As this class contains pointers) - * @param [in] topContainer To copy - * @return Reference of TopContainer - */ - TopContainer &operator=(const TopContainer &) = delete; - - /** - * @brief Start collecting - * @param [in] values To set as values - * @return N/A - */ - void start_collecting(const T *values) - { - values_ = values; - container_.clear(); - } - - /** - * @brief Push a value to be compared for topk - * @param [in] a A value to compare - * @return N/A - */ - void push(int32 a) - { - auto comparator = [this](int32 a, int32 b) { return compare_fun(a, b); }; - if (container_.size() <= (size_t)k_) - { - container_.push_back(a); - if (container_.size() == (size_t)(k_ + 1)) - { - std::make_heap(container_.begin(), container_.end(), comparator); - std::pop_heap(container_.begin(), container_.end(), comparator); - } - } - else if (comparator(a, container_.front())) - { - container_.back() = a; - std::push_heap(container_.begin(), container_.end(), comparator); - std::pop_heap(container_.begin(), container_.end(), comparator); - } - } - - /** - * @brief Get sorted result from pushed values - * @return Reference of vector with sorted values - */ - const std::vector<int32> &sorted_result() - { - auto comparator = [this](int32 a, int32 b) { return compare_fun(a, b); }; - if (container_.size() <= 
(size_t)(k_)) - { - std::sort(container_.begin(), container_.end(), comparator); - } - else - { - std::sort_heap(container_.begin(), container_.end() - 1, comparator); - container_.resize(k_); - } - return container_; - } - -private: - int32 k_; - std::vector<int32> container_; - const T *values_ = nullptr; - - bool compare_fun(int32 a, int32 b) const - { - if (values_[b] < values_[a]) - { - return true; - } - else if (values_[b] > values_[a]) - { - return false; - } - else - { - return a < b; - } - } -}; - -/** - * @brief Operates TopK operation with params - * @param [in] row_size Size of row in data - * @param [in] num_rows The number of rows in data - * @param [in] data To be operated in - * @param [in] k The top k predictions - * @param [out] output_indexes Indexes of targets in the top k predictions - * @param [out] output_values Values of targets in the top k predictions - * @return N/A - */ -template <typename T> -void TopK(int32 row_size, int32 num_rows, const T *data, int32 k, int32 *output_indexes, - T *output_values) -{ - TopContainer<T> topc(k, row_size); - for (int row = 0; row < num_rows; ++row) - { - const T *values_row = data + row * row_size; - topc.start_collecting(values_row); - for (int32 c = 0; c < row_size; ++c) - { - topc.push(c); - } - - // Prepare output buffers. - int32 *indexes_row = output_indexes + row * k; - T *output_row = output_values + row * k; - // We always assume that the output is sorted. - const auto &top_k = topc.sorted_result(); - std::copy(top_k.begin(), top_k.end(), indexes_row); - std::transform(top_k.begin(), top_k.end(), output_row, - [values_row](const int32 loc) { return values_row[loc]; }); - } -} - -} // namespace optimized_ops -} // namespace rt -} // namespace nnfw - -#endif // __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__ |