diff options
Diffstat (limited to 'libs/ARMComputeEx')
153 files changed, 10265 insertions, 2864 deletions
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h b/libs/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h index 026487077..e4e752ef9 100644 --- a/libs/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h +++ b/libs/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h @@ -14,6 +14,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + +/** + * @file CLKernelLibraryEx.h + * @ingroup COM_AI_RUNTIME + * @brief This file is a cloned version of CLKernelLibrary.h in ACL. This file defines + * an interface for CLKernelLibrary.cpp which adds more OpenCL kernels on top of ACL. + */ + #ifndef __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ #define __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ @@ -27,58 +35,76 @@ namespace arm_compute { -/** CLKernelLibrary class */ +/** + * @brief Class to build OpenCL kernels added from nnfw + * */ class CLKernelLibraryEx { using StringSet = std::set<std::string>; private: - /** Default Constructor. */ + /** + * @brief Construct a new CLKernelLibraryEx object + */ CLKernelLibraryEx(); public: - /** Prevent instances of this class from being copied */ + /** + * @brief Prevent instances of this class from being copied. + */ CLKernelLibraryEx(const CLKernelLibraryEx &) = delete; - /** Prevent instances of this class from being copied */ + + /** + * @brief Prevent instances of this class from being copied. + */ const CLKernelLibraryEx &operator=(const CLKernelLibraryEx &) = delete; - /** Access the KernelLibrary singleton. - * @return The KernelLibrary instance. + + /** + * @brief Get the KernelLibrary singleton. + * @return The KernelLibrary instance */ static CLKernelLibraryEx &get(); - /** Initialises the kernel library. - * - * @param[in] kernel_path (Optional) Path of the directory from which kernel sources are loaded. - * @param[in] context (Optional) CL context used to create programs. - * @param[in] device (Optional) CL device for which the programs are created. 
- */ - void init(std::string kernel_path = ".", cl::Context context = cl::Context::getDefault(), - cl::Device device = cl::Device::getDefault()) + + /** + * @brief Initialise the kernel library. + * @param[in] kernel_path Path of the directory from which kernel sources are loaded. + * @param[in] context CL context used to create programs. + * @param[in] device CL device for which the programs are created. + * @return N/A + */ + void init(std::string kernel_path, cl::Context context, cl::Device device) { _kernel_path = std::move(kernel_path); _context = std::move(context); _device = std::move(device); } - /** Sets the path that the kernels reside in. - * - * @param[in] kernel_path Path of the kernel. + + /** + * @brief Set the path that the kernels reside in. + * @param[in] kernel_path Path of the directory from which kernel sources are loaded. + * @return N/A */ void set_kernel_path(const std::string &kernel_path) { _kernel_path = kernel_path; }; - /** Gets the path that the kernels reside in. + + /** + * @brief Get the path that the kernels reside in. + * @return the path of kernel files */ std::string get_kernel_path() { return _kernel_path; }; - /** Gets the source of the selected program. - * + + /** + * @brief Get the source of the selected program. * @param[in] program_name Program name. - * * @return Source of the selected program. */ std::string get_program_source(const std::string &program_name); - /** Sets the CL context used to create programs. - * + + /** + * @brief Set the CL context used to create programs. * @note Setting the context also resets the device to the * first one available in the new context. - * * @param[in] context A CL context. + * @return N/A */ void set_context(cl::Context context) { @@ -102,42 +128,56 @@ public: } } - /** Accessor for the associated CL context. - * + /** + * @brief Return associated CL context. * @return A CL context. 
*/ cl::Context &context() { return _context; } - /** Sets the CL device for which the programs are created. - * + /** + * @brief Set the CL device for which the programs are created. * @param[in] device A CL device. + * @return N/A */ void set_device(cl::Device device) { _device = std::move(device); } - /** Return the device version - * + /** + * @brief Gets the CL device for which the programs are created. + * @return A CL device. + */ + cl::Device &get_device() { return _device; } + + /** + * @brief Return the device version * @return The content of CL_DEVICE_VERSION */ std::string get_device_version(); - /** Creates a kernel from the kernel library. - * + + /** + * @brief Create a kernel from the kernel library. * @param[in] kernel_name Kernel name. * @param[in] build_options_set Kernel build options as a set. - * * @return The created kernel. */ Kernel create_kernel(const std::string &kernel_name, const StringSet &build_options_set = {}) const; - /** Find the maximum number of local work items in a workgroup can be supported for the kernel. - * + + /** + * @brief Find the maximum number of local work items in a workgroup can be supported for the + * kernel. + * @param[in] kernel kernel object */ + size_t max_local_workgroup_size(const cl::Kernel &kernel) const; - /** Return the default NDRange for the device. - * + /** + * @brief Return the default NDRange for the device. + * @return default NDRangeof the device */ cl::NDRange default_ndrange() const; - /** Clear the library's cache of binary programs + /** + * @brief Clear the library's cache of binary programs + * @return N/A */ void clear_programs_cache() { @@ -145,29 +185,45 @@ public: _built_programs_map.clear(); } - /** Access the cache of built OpenCL programs */ + /** + * @brief Access the cache of built OpenCL programs + * @return program map data structure of which key is name of kernel and value is + * kerel source name. 
(*.cl) + */ const std::map<std::string, cl::Program> &get_built_programs() const { return _built_programs_map; } - /** Add a new built program to the cache - * + /** + * @brief Add a new built program to the cache * @param[in] built_program_name Name of the program * @param[in] program Built program to add to the cache + * @return N/A */ void add_built_program(const std::string &built_program_name, cl::Program program); + /** + * @brief Returns true if FP16 is supported by the CL device + * @return true if the CL device supports FP16 + */ + bool fp16_supported() const; + + /** + * @brief Returns true if int64_base_atomics extension is supported by the CL device + * @return true if the CL device supports int64_base_atomics extension + */ + bool int64_base_atomics_supported() const; + private: - /** Load program and its dependencies. - * + /** + * @brief Load program and its dependencies. * @param[in] program_name Name of the program to load. */ const Program &load_program(const std::string &program_name) const; - /** Concatenates contents of a set into a single string. - * + /** + * @brief Concatenates contents of a set into a single string. * @param[in] s Input set to concatenate. - * * @return Concatenated string. */ std::string stringify_set(const StringSet &s) const; diff --git a/libs/ARMComputeEx/arm_compute/core/CL/OpenCLEx.h b/libs/ARMComputeEx/arm_compute/core/CL/OpenCLEx.h new file mode 100644 index 000000000..dbda354d6 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/OpenCLEx.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_OPENCLEX_H__ +#define __ARM_COMPUTE_OPENCLEX_H__ + +#include <string> +#include <utility> + +/* Configure the Khronos C++ wrapper to target OpenCL 1.2: */ +#ifndef ARM_COMPUTE_NO_EXCEPTIONS +#define CL_HPP_ENABLE_EXCEPTIONS +#endif // ARM_COMPUTE_NO_EXCEPTIONS +#define CL_HPP_CL_1_2_DEFAULT_BUILD +#define CL_HPP_TARGET_OPENCL_VERSION 110 +#define CL_HPP_MINIMUM_OPENCL_VERSION 110 +#include <CL/cl2.hpp> + +namespace arm_compute +{ +/** Class for loading OpenCL symbols. */ +class CLSymbolsEx final +{ +private: + CLSymbolsEx() = default; + void load_symbols(void *handle); + +public: + /** Get the static instance of CLSymbols. + * + * @return The static instance of CLSymbols. + */ + static CLSymbolsEx &get(); + /** Load symbols from the given OpenCL library path. + * + * @param[in] library Path to the OpenCL library. 
+ * + * @return True if loading the library is successful. + */ + bool load(const std::string &library); + /** Load symbols from any of the default OpenCL library names. + * + * @return True if loading any library is successful. + */ + bool load_default(); + +#define DECLARE_FUNCTION_PTR(func_name) std::function<decltype(func_name)> func_name##_ptr = nullptr + + DECLARE_FUNCTION_PTR(clGetEventInfo); + DECLARE_FUNCTION_PTR(clSetEventCallback); + +#undef DECLARE_FUNCTION_PTR + +private: + std::pair<bool, bool> _loaded{false, false}; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_OPENCLEX_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLActivationLayerExKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLActivationLayerExKernel.h new file mode 100644 index 000000000..080cc47ef --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLActivationLayerExKernel.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLACTIVATIONLAYEREXKERNEL_H__ +#define __ARM_COMPUTE_CLACTIVATIONLAYEREXKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the activation layer kernel. 
*/ +class CLActivationLayerExKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLActivationLayerExKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLActivationLayerExKernel(const CLActivationLayerExKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLActivationLayerExKernel &operator=(const CLActivationLayerExKernel &) = delete; + /** Allow instances of this class to be moved */ + CLActivationLayerExKernel(CLActivationLayerExKernel &&) = default; + /** Allow instances of this class to be moved */ + CLActivationLayerExKernel &operator=(CLActivationLayerExKernel &&) = default; + /** Default destructor */ + ~CLActivationLayerExKernel() = default; + /** Set the input and output tensor. + * + * @note If the output tensor is a nullptr, the activation function will be performed in-place + * + * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will + * store the result + * of the activation function. Data types supported: + * QASYMM8/F16/F32. + * @param[out] output Destination tensor. Data type supported: same as @p input + * @param[in] act_info Activation layer information. + */ + void configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfoEx act_info); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLActivationLayerKernel + * + * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor + * will store the result + * of the activation function. Data types supported: QASYMM8/F16/F32. + * @param[in] output Destination tensor info. Data type supported: same as @p input + * @param[in] act_info Activation layer information. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ActivationLayerInfoEx &act_info); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + ICLTensor *_input; + ICLTensor *_output; + bool _run_in_place; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLACTIVATIONLAYEREXKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxKernel.h new file mode 100644 index 000000000..b91a26159 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxKernel.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CLArgMinMaxKernel.h + * @brief This file defines CLArgMinMaxKernel + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __ARM_COMPUTE_CLARG_MIN_MAX_KERNEL_H__ +#define __ARM_COMPUTE_CLARG_MIN_MAX_KERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to define interface for the argminmax max kernel. + */ +class CLArgMinMaxKernel : public ICLKernel +{ +public: + /** + * @brief Default constructor. 
+ */ + CLArgMinMaxKernel(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLArgMinMaxKernel to be copied + */ + CLArgMinMaxKernel(const CLArgMinMaxKernel &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLArgMinMaxKernel to be copied + * @return Reference of this instance + */ + CLArgMinMaxKernel &operator=(const CLArgMinMaxKernel &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLArgMinMaxKernel to be moved + */ + CLArgMinMaxKernel(CLArgMinMaxKernel &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLArgMinMaxKernel to be moved + * @return Reference of this instance + */ + CLArgMinMaxKernel &operator=(CLArgMinMaxKernel &&) = default; + /** + * @brief Initialise the kernel's input, output and border mode. + * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32. + * @param[out] output The output tensor, Data types supported: same as @p input. + * @param[in] argminmax_axis Axis to argminmax + * return N/A + */ + void configure(const ICLTensor *input, ICLTensor *output, const uint32_t argminmax_axis, + ArgOperation op); + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref + * CLArgMinMaxKernel + * @param[in] input An input tensor info. Data types supported: U8/QASYMM8/S32/F32. + * @param[in] output The output tensor info, Data types supported: same as @p input1. 
+ * @param[in] argminmax_axis Axis to argminmax + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const uint32_t argminmax_axis, ArgOperation op); + + /* + * @brief Run CLArgMinMaxKernel op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + /* + * @brief Run CLArgMinMaxKernel op on CPU + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run_on_cpu(cl::CommandQueue &queue); + +private: + const ICLTensor *_input; + ICLTensor *_output; + uint32_t _argminmax_axis; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLargminmaxMAXKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLArithmeticSubtractionExKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLArithmeticSubtractionExKernel.h new file mode 100644 index 000000000..9a765f310 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLArithmeticSubtractionExKernel.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __ARM_COMPUTE_CLARITHMETICSUBTRACTIONEXKERNEL_H__ +#define __ARM_COMPUTE_CLARITHMETICSUBTRACTIONEXKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the arithmetic subtraction kernel (support broadcasting) + * + * Arithmetic subtraction is computed by: + * @f[ output(x,y) = input1(x,y) - input2(x,y) @f] + */ +class CLArithmeticSubtractionExKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLArithmeticSubtractionExKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLArithmeticSubtractionExKernel(const CLArithmeticSubtractionExKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLArithmeticSubtractionExKernel &operator=(const CLArithmeticSubtractionExKernel &) = delete; + /** Allow instances of this class to be moved */ + CLArithmeticSubtractionExKernel(CLArithmeticSubtractionExKernel &&) = default; + /** Allow instances of this class to be moved */ + CLArithmeticSubtractionExKernel &operator=(CLArithmeticSubtractionExKernel &&) = default; + /** Default destructor */ + ~CLArithmeticSubtractionExKernel() = default; + + /** Initialise the kernel's inputs, output and convertion policy. + * + * @param[in] input1 First tensor input. Data types supported: U8/S16/F16/F32. + * @param[in] input2 Second tensor input. Data types supported: U8/S16/F16/F32. + * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), + * S16/F16/F32. + * @param[in] policy Policy to use to handle overflow. + */ + void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, + ConvertPolicy policy); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLArithmeticSubtractionExKernel + * + * @param[in] input1 First tensor input info. Data types supported: U8/S16/F16/F32. 
+ * @param[in] input2 Second tensor input info. Data types supported: U8/S16/F16/F32. + * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), + * S16/F16/F32. + * @param[in] policy Policy to use to handle overflow. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output, ConvertPolicy policy); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + BorderSize border_size() const override; + +private: + const ICLTensor *_input1; /**< Source tensor 1 */ + const ICLTensor *_input2; /**< Source tensor 2 */ + ICLTensor *_output; /**< Destination tensor */ +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLARITHMETICSUBTRACTIONEXKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h new file mode 100644 index 000000000..1387897c9 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_KERNEL_H__ +#define __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_KERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to perform BATCH_TO_SPACE_ND operation */ +class CLBatchToSpaceNDKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLBatchToSpaceNDKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLBatchToSpaceNDKernel(const CLBatchToSpaceNDKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLBatchToSpaceNDKernel &operator=(const CLBatchToSpaceNDKernel &) = delete; + /** Allow instances of this class to be moved */ + CLBatchToSpaceNDKernel(CLBatchToSpaceNDKernel &&) = default; + /** Allow instances of this class to be moved */ + CLBatchToSpaceNDKernel &operator=(CLBatchToSpaceNDKernel &&) = default; + /** Default destructor */ + ~CLBatchToSpaceNDKernel() = default; + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. 
+ */ + void configure(const ICLTensor *input, ICLTensor *output, const int32_t *block_size); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; /**< Source tensor */ + ICLTensor *_output; /**< Destination tensor */ +}; + +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_KERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h new file mode 100644 index 000000000..ab33d9d3a --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__ +#define __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to return truth values of two input tensors for Binary Logical Op*/ +class CLBinaryLogicalOpKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLBinaryLogicalOpKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). 
*/ + CLBinaryLogicalOpKernel(const CLBinaryLogicalOpKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLBinaryLogicalOpKernel &operator=(const CLBinaryLogicalOpKernel &) = delete; + /** Allow instances of this class to be moved */ + CLBinaryLogicalOpKernel(CLBinaryLogicalOpKernel &&) = default; + /** Allow instances of this class to be moved */ + CLBinaryLogicalOpKernel &operator=(CLBinaryLogicalOpKernel &&) = default; + /** Initialize the kernel's input, output. + * + * @param[in] input1 Source tensor1. + * @param[in] input2 Source tensor2. + * @param[out] output Output tensor. + */ + void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, + BinaryLogicalOperation op); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + + BorderSize border_size() const override; + +private: + const ICLTensor *_input1; + const ICLTensor *_input2; + ICLTensor *_output; +}; + +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h index 6bd33bf8f..4c2feb903 100644 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h @@ -14,6 +14,13 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + +/** + * @file CLCastKernel.h + * @ingroup COM_AI_RUNTIME + * @brief This file defines CLCastKernel class + */ + #ifndef __ARM_COMPUTE_CLCASTKERNEL_H__ #define __ARM_COMPUTE_CLCASTKERNEL_H__ @@ -23,30 +30,62 @@ namespace arm_compute { class ICLTensor; -/** OpenCL kernel to perform a cast operation */ +/** + * @brief Class to define OpenCL kernel for cast operation + */ class CLCastKernel : public ICLKernel { public: - /** Default constructor */ + /** + * @brief Construct CLCastKernel object + */ CLCastKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + */ CLCastKernel(const CLCastKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + */ CLCastKernel &operator=(const CLCastKernel &) = delete; - /** Allow instances of this class to be moved */ + + /** + * @brief Construct CLCastKernel object using default move constructor + * @param[in] CLCastKernel object to move + */ CLCastKernel(CLCastKernel &&) = default; - /** Allow instances of this class to be moved */ + + /** + * @brief Allow instances of this class to be moved + * @param[in] CLCastKernel object to move + */ CLCastKernel &operator=(CLCastKernel &&) = default; - /** Default destructor */ + + /** + * @brief Destruct this CLCastKernel object + */ ~CLCastKernel() = default; - /** Initialise the kernel's input and output. - * + + /** + * @brief Initialise the kernel's input and output. * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. 
+ * @return N/A */ void configure(const ICLTensor *input, ICLTensor *output); - // Inherited methods overridden: + /** + * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command + * queue. + * @note The queue is *not* flushed by this method, and therefore the kernel will not have + * been executed by the time this method returns. + * @param[in] window Region on which to execute the kernel. (Must be a valid region of + * the window returned by window()). + * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A + * @return N/A + */ void run(const Window &window, cl::CommandQueue &queue) override; private: diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLComparisonOpKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLComparisonOpKernel.h new file mode 100644 index 000000000..f5f455993 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLComparisonOpKernel.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __ARM_COMPUTE_CLCOMPARISON_OP_KERNEL_H__ +#define __ARM_COMPUTE_CLCOMPARISON_OP_KERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to check if values in both tensors are equal*/ +class CLComparisonOpKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLComparisonOpKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLComparisonOpKernel(const CLComparisonOpKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLComparisonOpKernel &operator=(const CLComparisonOpKernel &) = delete; + /** Allow instances of this class to be moved */ + CLComparisonOpKernel(CLComparisonOpKernel &&) = default; + /** Allow instances of this class to be moved */ + CLComparisonOpKernel &operator=(CLComparisonOpKernel &&) = default; + /** Initialize the kernel's input, output. + * + * @param[in] input1 Source tensor1. + * @param[in] input2 Source tensor2. + * @param[out] output Output tensor. + */ + void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, + const ComparisonOperation &op); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + + BorderSize border_size() const override; + +private: + const ICLTensor *_input1; + const ICLTensor *_input2; + ICLTensor *_output; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLCOMPARISON_OP_KERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h new file mode 100644 index 000000000..60ec7a82a --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__ +#define __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to perform depthTospace operation */ +class CLDepthToSpaceKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLDepthToSpaceKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLDepthToSpaceKernel(const CLDepthToSpaceKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLDepthToSpaceKernel &operator=(const CLDepthToSpaceKernel &) = delete; + /** Allow instances of this class to be moved */ + CLDepthToSpaceKernel(CLDepthToSpaceKernel &&) = default; + /** Allow instances of this class to be moved */ + CLDepthToSpaceKernel &operator=(CLDepthToSpaceKernel &&) = default; + /** Default destructor */ + ~CLDepthToSpaceKernel() = default; + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. 
+ */ + void configure(const ICLTensor *input, ICLTensor *output, const int32_t block_size); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; /**< Source tensor */ + ICLTensor *_output; /**< Destination tensor */ +}; + +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h new file mode 100644 index 000000000..da075db69 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file CLEmbeddingLookupKernel.h + * @ingroup COM_AI_RUNTIME + * @brief This file defines CLEmbeddingLookupKernel class + */ + +#ifndef __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__ +#define __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** +* @brief Class to perform EmbeddingLookup operation with opencl kernel +*/ +class CLEmbeddingLookupKernel : public ICLKernel +{ +public: + /** + * @brief Construct a CLEmbeddingLookupKernel object + * */ + CLEmbeddingLookupKernel(); + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + * */ + CLEmbeddingLookupKernel(const CLEmbeddingLookupKernel &) = delete; + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + * */ + CLEmbeddingLookupKernel &operator=(const CLEmbeddingLookupKernel &) = delete; + + /** + * @brief Construct a CLEmbeddingLookupKernel object by using default move constructor + * @param[in] CLEmbeddingLookupKernel object to move + * */ + CLEmbeddingLookupKernel(CLEmbeddingLookupKernel &&) = default; + + /** + * @brief Move assignment operator + * @param[in] CLEmbeddingLookupKernel object to move + * */ + CLEmbeddingLookupKernel &operator=(CLEmbeddingLookupKernel &&) = default; + + /** + * @brief Destruct this object + * */ + ~CLEmbeddingLookupKernel() = default; + + /** + * @brief Set the input and output of the kernel + * @param[in] input Source tensor. + * Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Destination tensor. Data type supported: Same as @p input + * @param[in] lookups Lookups are 1D tensor that values are indices into the first + * dimension of input. + * Data types supported: S32. 
+ * @return N/A + */ + void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups); + + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref + * CLEmbeddingLookupKernel + * @param[in] input The input tensor info. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[in] output The output tensor info, Data types supported: same as @p input1. + * @param[in] lookups Lookups info. Data types supported: S32. + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *lookups); + + /** + * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command + * queue. + * @note The queue is *not* flushed by this method, and therefore the kernel will not have + * been executed by the time this method returns. + * @param[in] window Region on which to execute the kernel. (Must be a valid region of + * the window returned by window()). + * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; /** Source tensor */ + ICLTensor *_output; /** Destination tensor */ + const ICLTensor *_lookups; /** Lookups tensor */ +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLExpKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLExpKernel.h new file mode 100644 index 000000000..a6ea539f8 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLExpKernel.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLEXPKERNEL_H__ +#define __ARM_COMPUTE_CLEXPKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to perform an exponential operation */ +class CLExpKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLExpKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLExpKernel(const CLExpKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLExpKernel &operator=(const CLExpKernel &) = delete; + /** Allow instances of this class to be moved */ + CLExpKernel(CLExpKernel &&) = default; + /** Allow instances of this class to be moved */ + CLExpKernel &operator=(CLExpKernel &&) = default; + /** Default destructor */ + ~CLExpKernel() = default; + /** Set the source, destination of the kernel + * + * @param[in] input Source tensor. Data type supported: F32. + * @param[out] output Destination tensor. Data type supported: F32. 
+ */ + void configure(const ICLTensor *input, ICLTensor *output); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + ICLTensor *_output; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLEXPKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherKernel.h index a51441aca..7e35a80b0 100644 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherKernel.h +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherKernel.h @@ -14,52 +14,85 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + +/** + * @file CLGatherKernel.h + * @ingroup COM_AI_RUNTIME + * @brief This file defines CLGatherKernel class + */ + #ifndef __ARM_COMPUTE_CLGATHERKERNEL_H__ #define __ARM_COMPUTE_CLGATHERKERNEL_H__ #include "arm_compute/core/CL/ICLKernel.h" -#include "arm_compute/core/Types.h" namespace arm_compute { class ICLTensor; -/** Interface for the gather kernel. - * +/** + * @brief Class to define an interface for the gather kernel. */ class CLGatherKernel : public ICLKernel { public: - /** Default constructor.*/ + /** + * @brief Construct CLGatherKernel object + * */ CLGatherKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers). */ + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + */ CLGatherKernel(const CLGatherKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers). */ + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ */ CLGatherKernel &operator=(const CLGatherKernel &) = delete; - /** Allow instances of this class to be moved */ + + /** + * @brief Construct CLGatherKernel object by using default move constructor + * @param[in] CLGatherKernel object to move + */ CLGatherKernel(CLGatherKernel &&) = default; - /** Allow instances of this class to be moved */ + + /** + * @brief Move assignment operator + * @param[in] CLGatherKernel object to move + */ CLGatherKernel &operator=(CLGatherKernel &&) = default; - /** Initialise the kernel's input, output and border mode. - * + + /** + * @brief Initialise the kernel's input, output and border mode. * @param[in] input1 An input tensor. Data types supported: U8/S32/F32. * @param[in] input2 An input tensor. Data types supported: S32. * @param[out] output The output tensor, Data types supported: same as @p input1. + * @return N/A */ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output); - /** Static function to check if given info will lead to a valid configuration of @ref + + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref * CLGatherKernel - * * @param[in] input1 An input tensor. Data types supported: U8/S32/F32. * @param[in] input2 An input tensor. Data types supported: S32. * @param[out] output The output tensor, Data types supported: same as @p input1. - * * @return a status */ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output); - // Inherited methods overridden: + /** + * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command + * queue. + * @note The queue is *not* flushed by this method, and therefore the kernel will not have + * been executed by the time this method returns. + * @param[in] window Region on which to execute the kernel. (Must be a valid region of + * the window returned by window()). 
+   * @param[in,out] queue Command queue on which to enqueue the kernel.
+   * @return N/A
+   */
  void run(const Window &window, cl::CommandQueue &queue) override;

private:
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h
new file mode 100644
index 000000000..c3fc15637
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +/** + * @file CLHashtableLookupKernel.h + * @ingroup COM_AI_RUNTIME + * @brief This file defines CLHashtableLookupKernel class + */ + +#ifndef __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__ +#define __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/runtime/CL/CLTensor.h" + +namespace arm_compute +{ +class ICLTensor; + +/** +* @brief Class to perform HashtableLookup operation with opencl kernel +*/ +class CLHashtableLookupKernel : public ICLKernel +{ +public: + /** + * @brief Construct a CLHashtableLookupKernel object + * */ + CLHashtableLookupKernel(); + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + * */ + CLHashtableLookupKernel(const CLHashtableLookupKernel &) = delete; + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + * */ + CLHashtableLookupKernel &operator=(const CLHashtableLookupKernel &) = delete; + + /** + * @brief Construct a CLHashtableLookupKernel object by using default move constructor + * @param[in] CLHashtableLookupKernel object to move + * */ + CLHashtableLookupKernel(CLHashtableLookupKernel &&) = default; + + /** + * @brief Move assignment operator + * @param[in] CLHashtableLookupKernel object to move + * */ + CLHashtableLookupKernel &operator=(CLHashtableLookupKernel &&) = default; + + /** + * @brief Destruct this object + * */ + ~CLHashtableLookupKernel() = default; + + /** + * @brief Set the input and output of the kernel + * @param[in] lookups Lookups 1D tensor that values are indices into the first dimension of + * input. + * @param[in] keys Keys 1D tensor. keys and input pair represent a map. + * Data types supported: S32 + * @param[in] input Source tensor. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p + * input. + * @param[out] hits Hits 1D tensor. 
A boolean tensor that indicates whether the lookup hits + * (True) or not (False). Data types supported: U8/QASYMM8 + * @return N/A + */ + void configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *input, + ICLTensor *output, ICLTensor *hits); + + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref + * CLHashtableLookupKernel + * @param[in] lookups The lookups tensor info. Data types supported: S32. + * @param[in] keys The keys tensor info. keys and input pair represent a map. + * Data types supported: S32 + * @param[in] input The input tensor info. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output The output tensor. Data types and data layouts supported: Same as @p + * input. + * @param[out] hits The hits tensor info. A boolean tensor that indicates whether the lookup + * hits + * (True) or not (False). Data types supported: U8/QASYMM8 + * @return a status + */ + static Status validate(const ITensorInfo *lookups, const ITensorInfo *keys, + const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *hits); + + /** + * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command + * queue. + * @note The queue is *not* flushed by this method, and therefore the kernel will not have + * been executed by the time this method returns. + * @param[in] window Region on which to execute the kernel. (Must be a valid region of + * the window returned by window()). 
+   * @param[in,out] queue Command queue on which to enqueue the kernel.
+   * @return N/A
+   */
+  void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+  const ICLTensor *_lookups; /** Lookups tensor */
+  const ICLTensor *_keys;    /** Keys tensor */
+  const ICLTensor *_input;   /** Source tensor */
+  ICLTensor *_output;        /** Destination tensor */
+  ICLTensor *_hits;          /** Hits tensor */
+  std::unique_ptr<CLTensor> _lookup_indices{nullptr}; /** Lookup indices tensor */
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h
new file mode 100644
index 000000000..ccbea147e
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLNEGKERNEL_H__
+#define __ARM_COMPUTE_CLNEGKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform a negation operation on tensor*/
+class CLNegKernel : public ICLKernel
+{
+public:
+  /** Default constructor */
+  CLNegKernel();
+  /** Prevent instances of this class from being copied (As this class contains pointers).
*/ + CLNegKernel(const CLNegKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLNegKernel &operator=(const CLNegKernel &) = delete; + /** Allow instances of this class to be moved */ + CLNegKernel(CLNegKernel &&) = default; + /** Allow instances of this class to be moved */ + CLNegKernel &operator=(CLNegKernel &&) = default; + /** Initialize the kernel's input, output. + * + * @param[in] input Source tensor. + * @param[out] output Destination tensor. + */ + void configure(const ICLTensor *input, ICLTensor *output); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + ICLTensor *_output; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLNEGKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h new file mode 100644 index 000000000..181a6226a --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __ARM_COMPUTE_CLNORMALIZATIONLAYEREXKERNEL_H__ +#define __ARM_COMPUTE_CLNORMALIZATIONLAYEREXKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the normalization layer kernel. + */ +class CLNormalizationLayerExKernel : public ICLKernel +{ +public: + /** Constructor */ + CLNormalizationLayerExKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLNormalizationLayerExKernel(const CLNormalizationLayerExKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLNormalizationLayerExKernel &operator=(const CLNormalizationLayerExKernel &) = delete; + /** Default Move Constructor. */ + CLNormalizationLayerExKernel(CLNormalizationLayerExKernel &&) = default; + /** Default move assignment operator */ + CLNormalizationLayerExKernel &operator=(CLNormalizationLayerExKernel &&) = default; + /** Set the input and output tensors. + * + * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions + * [width, height, IFM], + * and an optional 4th dimension for batch of inputs. Data types supported: + * F16/F32. + * @param[out] output Destination tensor. Output will have the same number of dimensions as + * input. Data types supported: same as @p input. + * @param[in] norm_info Normalization layer information like the normalization type, + * normalization size and other parameters. + */ + void configure(const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLNormalizationLayerKernel + * + * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions + * [width, height, IFM], + * and an optional 4th dimension for batch of inputs. Data types supported: + * F16/F32. + * @param[in] output Destination tensor. 
Output will have the same number of dimensions as + * input. Data types supported: same as @p input. + * @param[in] norm_info Normalization layer information like the normalization type, normalization + * size and other parameters. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + NormalizationLayerInfo norm_info); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + BorderSize border_size() const override; + +private: + const ICLTensor *_input; + ICLTensor *_output; + BorderSize _border_size; + bool _is_in_map; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLNORMALIZATIONLAYEREXKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h new file mode 100644 index 000000000..eff1b8bd5 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __ARM_COMPUTE_CLPRELU_KERNEL_H__ +#define __ARM_COMPUTE_CLPRELU_KERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to calculate PReLU*/ +class CLPReLUKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLPReLUKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLPReLUKernel(const CLPReLUKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLPReLUKernel &operator=(const CLPReLUKernel &) = delete; + /** Allow instances of this class to be moved */ + CLPReLUKernel(CLPReLUKernel &&) = default; + /** Allow instances of this class to be moved */ + CLPReLUKernel &operator=(CLPReLUKernel &&) = default; + /** Initialize the kernel's input, output. + * + * @param[in] input Source tensor1. + * @param[in] alpha Source tensor2. + * @param[out] output Output tensor. + */ + void configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + + BorderSize border_size() const override; + +private: + const ICLTensor *_input; + const ICLTensor *_alpha; + ICLTensor *_output; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLPRELU_KERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernel.h new file mode 100644 index 000000000..cbaa2adee --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernel.h @@ -0,0 +1,60 @@ +/* +* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +* Copyright (c) 2016-2018 ARM Limited. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ +#ifndef __ARM_COMPUTE_CLPADLAYERKERNEL_H__ +#define __ARM_COMPUTE_CLPADLAYERKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to perform PAD operation */ +class CLPadLayerKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLPadLayerKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLPadLayerKernel(const CLPadLayerKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLPadLayerKernel &operator=(const CLPadLayerKernel &) = delete; + /** Allow instances of this class to be moved */ + CLPadLayerKernel(CLPadLayerKernel &&) = default; + /** Allow instances of this class to be moved */ + CLPadLayerKernel &operator=(CLPadLayerKernel &&) = default; + /** Default destructor */ + ~CLPadLayerKernel() = default; + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[in] pad_size Padding Size tensor. 
Data types supported : S32 + */ + void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *pad_size); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; /**< Source tensor */ + ICLTensor *_output; /**< Destination tensor */ + ICLTensor *_pad_size; /**< Padding Size tensor */ +}; + +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLPADLAYERKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPermuteExKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPermuteExKernel.h new file mode 100644 index 000000000..3434deee8 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPermuteExKernel.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLPERMUTEEXKERNEL_H__ +#define __ARM_COMPUTE_CLPERMUTEEXKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to perform tensor permutation. 
+ * + * Permutes given a permutation vector + */ +class CLPermuteExKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLPermuteExKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLPermuteExKernel(const CLPermuteExKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLPermuteExKernel &operator=(const CLPermuteExKernel &) = delete; + /** Allow instances of this class to be moved */ + CLPermuteExKernel(CLPermuteExKernel &&) = default; + /** Allow instances of this class to be moved */ + CLPermuteExKernel &operator=(CLPermuteExKernel &&) = default; + /** Set the input and output of the kernel. + * + * @param[in] input The input tensor to permute. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] output The output tensor. Data types supported: Same as @p input + * @param[in] perm Permutation vector + */ + void configure(const ICLTensor *input, ICLTensor *output, const PermutationVector &perm); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLPermuteKernel + * + * @param[in] input First tensor input info. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] output Output tensor info. Data types supported: same as @p input. 
+ * @param[in] perm Permutation vector + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const PermutationVector &perm); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + ICLTensor *_output; + PermutationVector _perm; +}; +} // arm_compute +#endif /*__ARM_COMPUTE_CLPERMUTEEXKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h index cd2b255bc..d579f5d8f 100644 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h @@ -14,68 +14,106 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + +/** + * @file CLPixelWiseDivisionKernel.h + * @ingroup COM_AI_RUNTIME + * @brief This file defines CLPixelWiseDivisionKernel class + */ + #ifndef __ARM_COMPUTE_CLPIXELWISEDIVISIONKERNEL_H__ #define __ARM_COMPUTE_CLPIXELWISEDIVISIONKERNEL_H__ #include "arm_compute/core/CL/ICLKernel.h" -#include "arm_compute/core/Types.h" namespace arm_compute { class ICLTensor; -/** Interface for the pixelwise division kernel. - * +/** + * @brief Interface for the pixelwise division kernel. */ class CLPixelWiseDivisionKernel : public ICLKernel { public: - /** Default constructor.*/ + /** + * @brief Construct a CLPixelWiseDivisionKernel object + */ CLPixelWiseDivisionKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers). */ + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + */ CLPixelWiseDivisionKernel(const CLPixelWiseDivisionKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers). 
*/ + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + */ CLPixelWiseDivisionKernel &operator=(const CLPixelWiseDivisionKernel &) = delete; - /** Allow instances of this class to be moved */ + + /** + * @brief Construct a CLPixelWiseDivisionKernel object by using move constructor + * @param[in] CLPixelWiseDivisionKernel object to move + */ CLPixelWiseDivisionKernel(CLPixelWiseDivisionKernel &&) = default; - /** Allow instances of this class to be moved */ + + /** + * @brief Allow instances of this class to be moved + * @param[in] CLPixelWiseDivisionKernel object to move + */ CLPixelWiseDivisionKernel &operator=(CLPixelWiseDivisionKernel &&) = default; - /** Initialise the kernel's input, output and border mode. - * - * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32. + + /** + * @brief Initialise the kernel's input, output and border mode. + * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32. * @param[in] input2 An input tensor. Data types supported: same as @p input1. * @param[out] output The output tensor, Data types supported: same as @p input1. Note: - * U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). + * U8 requires both inputs to be U8. * @param[in] scale Scale to apply after division. * Scale must be positive and its value must be either 1/255 or 1/2^n - * where n is between 0 and 15. For QS8 and QS16 scale must be 1. + * where n is between 0 and 15. * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest * even. 
+ * @return N/A */ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); - /** Static function to check if given info will lead to a valid configuration of @ref + + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref * CLPixelWiseDivisionKernel - * - * @param[in] input1 An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32. + * @param[in] input1 An input tensor info. Data types supported: U8/S16/F16/F32. * @param[in] input2 An input tensor info. Data types supported: same as @p input1. * @param[in] output The output tensor info, Data types supported: same as @p input1. - * Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). + * Note: U8 requires both inputs to be U8. * @param[in] scale Scale to apply after division. * Scale must be positive and its value must be either 1/255 or 1/2^n - * where n is between 0 and 15. For QS8 and QS16 scale must be 1. + * where n is between 0 and 15. * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. - * * @return a status */ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); - // Inherited methods overridden: + /** + * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command + * queue. + * @note The queue is *not* flushed by this method, and therefore the kernel will not have + * been executed by the time this method returns. + * @param[in] window Region on which to execute the kernel. (Must be a valid region of + * the window returned by window()). 
+ * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A + * @return N/A + */ void run(const Window &window, cl::CommandQueue &queue) override; + + /** + * @brief The size of the border for that kernel + * @return The width in number of elements of the border. + */ BorderSize border_size() const override; private: diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h deleted file mode 100644 index a7d96cc5c..000000000 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __ARM_COMPUTE_CLREDUCEMAXKERNEL_H__ -#define __ARM_COMPUTE_CLREDUCEMAXKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" -#include "arm_compute/core/Types.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Interface for the pixelwise division kernel. - * - */ -class CLReduceMaxKernel : public ICLKernel -{ -public: - /** Default constructor.*/ - CLReduceMaxKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers). */ - CLReduceMaxKernel(const CLReduceMaxKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers). 
*/ - CLReduceMaxKernel &operator=(const CLReduceMaxKernel &) = delete; - /** Allow instances of this class to be moved */ - CLReduceMaxKernel(CLReduceMaxKernel &&) = default; - /** Allow instances of this class to be moved */ - CLReduceMaxKernel &operator=(CLReduceMaxKernel &&) = default; - /** Initialise the kernel's input, output and border mode. - * - * @param[in] input An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32. - * @param[in] axis Axis to reduce - * @param[out] output The output tensor, Data types supported: same as @p input1. Note: - * U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). - */ - void configure(const ICLTensor *input, int32_t axis, ICLTensor *output); - /** Static function to check if given info will lead to a valid configuration of @ref - * CLReduceMaxKernel - * - * @param[in] input An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32. - * @param[in] axis Axis to reduce - * @param[in] output The output tensor info, Data types supported: same as @p input1. - * Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). - * - * @return a status - */ - static Status validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - void run_on_cpu(cl::CommandQueue &queue); - -private: - const ICLTensor *_input; - ICLTensor *_output; - int32_t _axis; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLREDUCEMAXKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h new file mode 100644 index 000000000..a26a4a7fc --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CLReduceOperationKernel.h + * @brief This file defines CLReduceOperationKernel class + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__ +#define __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to define interface for the reduce operation kernel + */ +class CLReduceOperationKernel : public ICLKernel +{ +public: + /** + * @brief Default constructor + */ + CLReduceOperationKernel(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + */ + CLReduceOperationKernel(const CLReduceOperationKernel &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + */ + CLReduceOperationKernel &operator=(const CLReduceOperationKernel &) = delete; + /** + * @brief Allow instances of this class to be moved + */ + CLReduceOperationKernel(CLReduceOperationKernel &&) = default; + /** + * @brief Allow instances of this class to be moved + */ + CLReduceOperationKernel &operator=(CLReduceOperationKernel &&) = default; + /** + * @brief Default destructor + */ + ~CLReduceOperationKernel() = default; + + /** + * @brief Set the input and output tensors. + * @param[in] input Source tensor. Data types supported: U8/S32/F32. 
+ * @param[out] output Destination tensor. Data types supported: Same as @p input. + * Output will have the same number of dimensions as input. + * @param[in] axis Axis along which to reduce. + * @param[in] op Reduce operation to perform. + * @return N/A + */ + void configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis, + ReduceOperation op); + + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref + * CLReduceOperationKernel. + * @param[in] input Source tensor info. Data types supported: U8/S32/F32. + * @param[in] output Destination tensor info. Data types supported: Same as @p input. + * Output will have the same number of dimensions as input. + * @param[in] axis Axis along which to reduce. + * @param[in] op Reduce operation to perform. + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis, + ReduceOperation op); + + /* + * @brief Run CLReduceOperationKernel op + * @param[in] window Window to be used for in_slice + * @param[in] queue CLQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + ICLTensor *_output; + uint32_t _axis; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReductionMeanKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReductionMeanKernel.h deleted file mode 100644 index de9df3381..000000000 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReductionMeanKernel.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __ARM_COMPUTE_CLREDUCTIONMEANKERNEL_H__ -#define __ARM_COMPUTE_CLREDUCTIONMEANKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" -#include "arm_compute/core/Types.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Interface for the reduction operation kernel */ -class CLReductionMeanKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLReductionMeanKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLReductionMeanKernel(const CLReductionMeanKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLReductionMeanKernel &operator=(const CLReductionMeanKernel &) = delete; - /** Allow instances of this class to be moved */ - CLReductionMeanKernel(CLReductionMeanKernel &&) = default; - /** Allow instances of this class to be moved */ - CLReductionMeanKernel &operator=(CLReductionMeanKernel &&) = default; - /** Default destructor */ - ~CLReductionMeanKernel() = default; - - /** Set the input and output tensors. - * - * @param[in] input Source tensor. Data types supported: F32. Data layouts supported: NCHW. - * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input. - * Output will have the same number of dimensions as input. - * @param[in] axis Axis along which to reduce. 
Supported reduction axis : 0, 1 - */ - void configure(const ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis); - - /** Static function to check if given info will lead to a valid configuration of @ref - * CLReductionMeanKernel. - * - * @param[in] input Source tensor info. Data types supported: F32. Data layouts supported: NCHW. - * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p - * input. - * Output will have the same number of dimensions as input. - * @param[in] axis Axis along which to reduce. Supported reduction axis : 0, 1 - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - std::vector<uint32_t> axis); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - BorderSize border_size() const override; - -private: - const ICLTensor *_input; - ICLTensor *_output; - std::vector<uint32_t> _reduction_axis; - BorderSize _border_size; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLREDUCTIONMEANKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h new file mode 100644 index 000000000..68534f1ab --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_KERNEL_H__ +#define __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_KERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to perform SPACE_TO_BATCH_ND operation */ +class CLSpaceToBatchNDKernel final : public ICLKernel +{ +public: + /** Default constructor */ + CLSpaceToBatchNDKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLSpaceToBatchNDKernel(const CLSpaceToBatchNDKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLSpaceToBatchNDKernel &operator=(const CLSpaceToBatchNDKernel &) = delete; + /** Allow instances of this class to be moved */ + CLSpaceToBatchNDKernel(CLSpaceToBatchNDKernel &&) = default; + /** Allow instances of this class to be moved */ + CLSpaceToBatchNDKernel &operator=(CLSpaceToBatchNDKernel &&) = default; + /** Default destructor */ + ~CLSpaceToBatchNDKernel() = default; + /** Initialise the kernel's input and output. + * + * @note The data layout of input and output must be the same. + * @note The number of dimensions of input and output must be 4, and `spatial` dimensions + * are height and width. + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32. + * Data layout supported: NCHW/NHWC + * @param[in] block_size Block size tensor. Data types supported: S32. + * @param[in] padding_size Padding size tensor. Data types supported: S32. + * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32. 
+ * Data layout supported: NCHW/NHWC + */ + void configure(const ICLTensor *input, const ICLTensor *block_size, const ICLTensor *padding_size, + ICLTensor *output); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; /**< Source tensor */ + const ICLTensor *_block_size; /**< Block size tensor */ + const ICLTensor *_padding_size; /**< Padding size tensor */ + ICLTensor *_output; /**< Destination tensor */ +}; + +} // namespace arm_compute + +#endif /* __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_KERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h new file mode 100644 index 000000000..be845a549 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__ +#define __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to perform spaceTodepth operation */ +class CLSpaceToDepthKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLSpaceToDepthKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLSpaceToDepthKernel(const CLSpaceToDepthKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLSpaceToDepthKernel &operator=(const CLSpaceToDepthKernel &) = delete; + /** Allow instances of this class to be moved */ + CLSpaceToDepthKernel(CLSpaceToDepthKernel &&) = default; + /** Allow instances of this class to be moved */ + CLSpaceToDepthKernel &operator=(CLSpaceToDepthKernel &&) = default; + /** Default destructor */ + ~CLSpaceToDepthKernel() = default; + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + */ + void configure(const ICLTensor *input, ICLTensor *output, const int32_t block_size); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; /**< Source tensor */ + ICLTensor *_output; /**< Destination tensor */ +}; + +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSquaredDifferenceKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSquaredDifferenceKernel.h new file mode 100644 index 000000000..a4c44e35d --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSquaredDifferenceKernel.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLSQUARED_DIFFERENCE_KERNEL_H__ +#define __ARM_COMPUTE_CLSQUARED_DIFFERENCE_KERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to return squared difference value of two tensors (x-y)^2*/ +class CLSquaredDifferenceKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLSquaredDifferenceKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLSquaredDifferenceKernel(const CLSquaredDifferenceKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLSquaredDifferenceKernel &operator=(const CLSquaredDifferenceKernel &) = delete; + /** Allow instances of this class to be moved */ + CLSquaredDifferenceKernel(CLSquaredDifferenceKernel &&) = default; + /** Allow instances of this class to be moved */ + CLSquaredDifferenceKernel &operator=(CLSquaredDifferenceKernel &&) = default; + /** Initialize the kernel's input, output. + * + * @param[in] input1 Source tensor1. + * @param[in] input2 Source tensor2. + * @param[out] output Output tensor. 
+ */ + void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + + BorderSize border_size() const override; + +private: + const ICLTensor *_input1; + const ICLTensor *_input2; + ICLTensor *_output; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLSQUARED_DIFFERENCE_KERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceExKernel.h index 248ae6635..6368c380e 100644 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceKernel.h +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceExKernel.h @@ -14,36 +14,64 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef __ARM_COMPUTE_CLSTRIDEDSLICEKERNEL_H__ -#define __ARM_COMPUTE_CLSTRIDEDSLICEKERNEL_H__ + +/** + * @file CLStridedSliceExKernel.h + * @ingroup COM_AI_RUNTIME + * @brief This file defines CLStridedSliceExKernel class + */ + +#ifndef __ARM_COMPUTE_CLSTRIDEDSLICEEXKERNEL_H__ +#define __ARM_COMPUTE_CLSTRIDEDSLICEEXKERNEL_H__ #include "arm_compute/core/CL/ICLKernel.h" -#include "arm_compute/core/Types.h" namespace arm_compute { class ICLTensor; -/** Interface for the kernel to extract a strided slice of a tensor */ -class CLStridedSliceKernel : public ICLKernel +/** +* @brief Class to define an interface for the kernel to extract a strided slice of a tensor +*/ +class CLStridedSliceExKernel : public ICLKernel { public: - /** Default constructor */ - CLStridedSliceKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLStridedSliceKernel(const CLStridedSliceKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLStridedSliceKernel &operator=(const CLStridedSliceKernel &) = delete; 
- /** Allow instances of this class to be moved */ - CLStridedSliceKernel(CLStridedSliceKernel &&) = default; - /** Allow instances of this class to be moved */ - CLStridedSliceKernel &operator=(CLStridedSliceKernel &&) = default; - /** Default destructor */ - ~CLStridedSliceKernel() = default; - /** Set the input and output of the kernel - * + /** + * @brief Construct a CLStridedSliceExKernel object + * */ + CLStridedSliceExKernel(); + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + * */ + CLStridedSliceExKernel(const CLStridedSliceExKernel &) = delete; + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + * */ + CLStridedSliceExKernel &operator=(const CLStridedSliceExKernel &) = delete; + + /** + * @brief Construct a CLStridedSliceExKernel object by using default move constructor + * @param[in] CLStridedSliceExKernel object to move + * */ + CLStridedSliceExKernel(CLStridedSliceExKernel &&) = default; + + /** + * @brief Move assignment operator + * @param[in] CLStridedSliceExKernel object to move + * */ + CLStridedSliceExKernel &operator=(CLStridedSliceExKernel &&) = default; + + /** + * @brief Destruct this object + * */ + ~CLStridedSliceExKernel() = default; + + /** + * @brief Set the input and output of the kernel * @param[in] input Source tensor. Data type supported: - * U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32 + * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 * @param[out] output Destination tensor. Data type supported: Same as @p input * @param[in] beginData The begin tensor. Data types supported: S32. * The number of dimensions must be 1. @@ -57,17 +85,17 @@ public: * @param[in] beginMask Mask for begin * @param[in] endMask Mask for end * @param[in] shrinkAxisMask Mask for shrink axis. 
- * + * @return N/A */ void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData, ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask, int32_t shrinkAxisMask); - /** Static function to check if given info will lead to a valid configuration of @ref - * CLStridedSliceKernel - * + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref + * CLStridedSliceExKernel * @param[in] input The input tensor info. Data types supported: - * U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32 + * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 * @param[in] output The output tensor info, Data types supported: same as @p input1. * @param[in] begin The begin tensor info. Data types supported: S32. * The number of dimensions must be 1. @@ -81,7 +109,6 @@ public: * @param[in] beginMask Mask for begin * @param[in] endMask Mask for end * @param[in] shrinkAxisMask Mask for shrink axis. - * * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *output, @@ -89,7 +116,16 @@ public: const ITensorInfo *stride, int32_t beginMask, int32_t endMask, int32_t shrinkAxisMask); - // Inherited methods overridden: + /** + * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command + * queue. + * @note The queue is *not* flushed by this method, and therefore the kernel will not have + * been executed by the time this method returns. + * @param[in] window Region on which to execute the kernel. (Must be a valid region of + * the window returned by window()). 
+ * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A + * @return N/A + */ void run(const Window &window, cl::CommandQueue &queue) override; private: @@ -103,4 +139,4 @@ private: int32_t _shrinkAxisMask; /** Shrink axis mask */ }; } // namespace arm_compute -#endif /*__ARM_COMPUTE_CLSTRIDEDSLICEKERNEL_H__ */ +#endif /*__ARM_COMPUTE_CLSTRIDEDSLICEEXKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h index 5c567f38e..eb2bad254 100644 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h @@ -14,14 +14,18 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + +/** + * @file CLTopKV2Kernel.h + * @brief This file defines classes for TopKV2Kernel + * @ingroup COM_AI_RUNTIME + */ + #ifndef __ARM_COMPUTE_CLTOPKV2KERNEL_H__ #define __ARM_COMPUTE_CLTOPKV2KERNEL_H__ -#include "arm_compute/core/CL/ICLArray.h" #include "arm_compute/core/CL/ICLKernel.h" -#include <array> - // these parameters can be changed #define _ITEMS 16 // number of items in a group #define _GROUPS 4 // the number of virtual processors is _ITEMS * _GROUPS @@ -33,24 +37,59 @@ namespace arm_compute { class ICLTensor; +/** + * @brief Class to define CLTopKV2Single + */ class CLTopKV2Single : public ICLKernel { public: - /** Constructor */ + /** + * @brief Constructor + */ CLTopKV2Single(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLTopKV2Single to be copied + */ CLTopKV2Single(const CLTopKV2Single &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2Single to be copied + * @return Reference of this instance + */ CLTopKV2Single &operator=(const CLTopKV2Single &) = delete; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Single to be moved + */ CLTopKV2Single(CLTopKV2Single &&) = default; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Single to be moved + * @return Reference of this instance + */ CLTopKV2Single &operator=(CLTopKV2Single &&) = default; + /** + * @brief Initialise kernel with params + * @param[in] input An input tensor + * @param[in] topk_values Values of the top k predictions + * @param[in] topk_indices Indices of the top k predictions + * @param[in] indices Indices + * @param[in] temp_stack Temp stack + * @param[in] k K of the top k predictions + * @param[in] n Number times to quick-sort + * return N/A + */ void configure(ICLTensor *input, ICLTensor *topk_values, ICLTensor *topk_indices, cl::Buffer *indices, cl::Buffer *temp_stack, int k, int n); - // Inherited methods overridden: + /* + * @brief Run CLTopKV2Single op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ void run(const Window &window, cl::CommandQueue &queue) override; private: @@ -59,52 +98,121 @@ private: ICLTensor *_topk_indices; }; +/** + * @brief Class to define CLTopKV2Init + */ class CLTopKV2Init : public ICLKernel { public: - /** Constructor */ + /** 
+ * @brief Constructor + */ CLTopKV2Init(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2Init to be copied + */ CLTopKV2Init(const CLTopKV2Init &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2Init to be copied + * @return Reference of this instance + */ CLTopKV2Init &operator=(const CLTopKV2Init &) = delete; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Init to be moved + */ CLTopKV2Init(CLTopKV2Init &&) = default; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Init to be moved + * @return Reference of this instance + */ CLTopKV2Init &operator=(CLTopKV2Init &&) = default; + /** + * @brief Initialise kernel with params + * @param[in] input An input tensor + * @param[in] in_key_buf Buffer of input key + * @param[in] in_ind_buf Buffer of input index + * @param[in] n Number times to quick-sort + * return N/A + */ void configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffer *in_ind_buf, int n); - // Inherited methods overridden: + /* + * @brief Run CLTopKV2Init op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ void run(const Window &window, cl::CommandQueue &queue) override; private: ICLTensor *_input; }; +/** + * @brief Class to define CLRadixSortHistogram + */ class CLRadixSortHistogram : public ICLKernel { public: - /** Constructor */ + /** 
+ * @brief Constructor + */ CLRadixSortHistogram(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortHistogram to be copied + */ CLRadixSortHistogram(const CLRadixSortHistogram &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortHistogram to be copied + * @return Reference of this instance + */ CLRadixSortHistogram &operator=(const CLRadixSortHistogram &) = delete; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortHistogram to be moved + */ CLRadixSortHistogram(CLRadixSortHistogram &&) = default; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortHistogram to be moved + * @return Reference of this instance + */ CLRadixSortHistogram &operator=(CLRadixSortHistogram &&) = default; + /** + * @brief Initialise kernel with params + * @param[out] hist_buf Buffer of histogram + * @param[in] bits Number of bits to be used for radix sort + * @param[in] n Integer number size to sort + * return N/A + */ void configure(cl::Buffer *hist_buf, int bits, int n); + /** + * @brief Set pass + * @param[in] pass Passes made of in radix sort algorithm + * @param[in] in_key_buf Buffer of input key + * return N/A + */ void setPass(int pass, cl::Buffer *in_key_buf) { _pass = pass; _in_key_buf = in_key_buf; } - // Inherited methods overridden: + /* + * @brief Run CLRadixSortHistogram op + * @param[in] window Window to be used for in_slice 
+ * @param[in] queue cl::CommandQueue + * @return N/A + */ void run(const Window &window, cl::CommandQueue &queue) override; private: @@ -112,82 +220,210 @@ private: cl::Buffer *_in_key_buf; }; +/** + * @brief Class to define CLRadixSortScanHistogram + */ class CLRadixSortScanHistogram : public ICLKernel { public: - /** Constructor */ + /** + * @brief Constructor + */ CLRadixSortScanHistogram(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortScanHistogram to be copied + */ CLRadixSortScanHistogram(const CLRadixSortScanHistogram &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortScanHistogram to be copied + * @return Reference of this instance + */ CLRadixSortScanHistogram &operator=(const CLRadixSortScanHistogram &) = delete; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortScanHistogram to be moved + */ CLRadixSortScanHistogram(CLRadixSortScanHistogram &&) = default; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortScanHistogram to be moved + * @return Reference of this instance + */ CLRadixSortScanHistogram &operator=(CLRadixSortScanHistogram &&) = default; + /** + * @brief Initialise kernel with params + * @param[out] hist_buf Buffer of histogram + * @param[out] glob_sum_buf Buffer of global sum + * @param[in] bits Number of bits to be used for radix sort + * return N/A + */ void configure(cl::Buffer 
*hist_buf, cl::Buffer *glob_sum_buf, int bits); - // Inherited methods overridden: + /* + * @brief Run CLRadixSortScanHistogram op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ void run(const Window &window, cl::CommandQueue &queue) override; }; +/** + * @brief Class to define CLRadixSortGlobalScanHistogram + */ class CLRadixSortGlobalScanHistogram : public ICLKernel { public: - /** Constructor */ + /** + * @brief Constructor + */ CLRadixSortGlobalScanHistogram(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortGlobalScanHistogram to be copied + */ CLRadixSortGlobalScanHistogram(const CLRadixSortGlobalScanHistogram &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLRadixSortGlobalScanHistogram to be copied + * @return Reference of this instance + */ CLRadixSortGlobalScanHistogram &operator=(const CLRadixSortGlobalScanHistogram &) = delete; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortGlobalScanHistogram to be moved + */ CLRadixSortGlobalScanHistogram(CLRadixSortGlobalScanHistogram &&) = default; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortGlobalScanHistogram to be moved + * @return Reference of this instance + */ CLRadixSortGlobalScanHistogram &operator=(CLRadixSortGlobalScanHistogram &&) = default; + /** + * @brief Initialise kernel with params + * @param[out] glob_sum_buf Buffer of global sum + * @param[out] temp_buf Temp buffer to be used while RadixSortGlobalScanHistogram + * @param[in] bits Number of bits to be used for radix sort + * return N/A + */ void configure(cl::Buffer *glob_sum_buf, cl::Buffer *temp_buf, int bits); - // Inherited methods overridden: + /* + * @brief Run CLRadixSortGlobalScanHistogram op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ void run(const Window &window, cl::CommandQueue &queue) override; }; +/** + * @brief Class to define CLRadixSortPasteHistogram + */ class CLRadixSortPasteHistogram : public ICLKernel { public: - /** Constructor */ + /** + * @brief Constructor + */ CLRadixSortPasteHistogram(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLRadixSortPasteHistogram to be copied + */ CLRadixSortPasteHistogram(const CLRadixSortPasteHistogram &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortPasteHistogram to be copied + * @return Reference of this instance + */ CLRadixSortPasteHistogram &operator=(const CLRadixSortPasteHistogram &) = delete; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortPasteHistogram to be moved + */ CLRadixSortPasteHistogram(CLRadixSortPasteHistogram &&) = default; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortPasteHistogram to be moved + * @return Reference of this instance + */ CLRadixSortPasteHistogram &operator=(CLRadixSortPasteHistogram &&) = default; + /** + * @brief Initialise kernel with params + * @param[out] hist_buf Buffer of histogram + * @param[out] glob_sum_buf Buffer of global sum + * @param[in] bits Number of bits to be used for radix sort + * return N/A + */ void configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits); - // Inherited methods overridden: + /* + * @brief Run CLRadixSortPasteHistogram op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ void run(const Window &window, cl::CommandQueue &queue) override; }; +/** + * @brief Class to define CLRadixSortReorder + */ class CLRadixSortReorder : public ICLKernel { public: - /** Constructor */ + /** + * @brief Constructor + */ CLRadixSortReorder(); - /** Prevent instances of this class from being copied (As this class contains 
pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortReorder to be copied + */ CLRadixSortReorder(const CLRadixSortReorder &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortReorder to be copied + * @return Reference of this instance + */ CLRadixSortReorder &operator=(const CLRadixSortReorder &) = delete; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortReorder to be moved + */ CLRadixSortReorder(CLRadixSortReorder &&) = default; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortReorder to be moved + * @return Reference of this instance + */ CLRadixSortReorder &operator=(CLRadixSortReorder &&) = default; + /** + * @brief Initialise kernel with params + * @param[out] hist_buf Buffer of histogram + * @param[in] bits Number of bits to be used for radix sort + * @param[in] n Integer number size to sort + * return N/A + */ void configure(cl::Buffer *hist_buf, int bits, int n); + /** + * @brief Set pass + * @param[in] pass Passes made of in radix sort algorithm + * @param[in] in_key_buf Buffer of input key + * @param[out] out_key_buf Buffer of output key + * @param[in] in_ind_buf Buffer of input index + * @param[out] out_ind_buf Buffer of output index + * return N/A + */ void setPass(int pass, cl::Buffer *in_key_buf, cl::Buffer *out_key_buf, cl::Buffer *in_ind_buf, cl::Buffer *out_ind_buf) { @@ -197,7 +433,12 @@ public: _in_ind_buf = in_ind_buf; _out_ind_buf = out_ind_buf; } - // Inherited methods 
overridden: + /* + * @brief Run CLRadixSortReorder op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ void run(const Window &window, cl::CommandQueue &queue) override; private: @@ -208,47 +449,115 @@ private: cl::Buffer *_out_ind_buf; }; +/** + * @brief Class to define CLTopKV2FindFirstNegative + */ class CLTopKV2FindFirstNegative : public ICLKernel { public: - /** Constructor */ + /** + * @brief Constructor + */ CLTopKV2FindFirstNegative(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2FindFirstNegative to be copied + */ CLTopKV2FindFirstNegative(const CLTopKV2FindFirstNegative &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLTopKV2FindFirstNegative to be copied + * @return Reference of this instance + */ CLTopKV2FindFirstNegative &operator=(const CLTopKV2FindFirstNegative &) = delete; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2FindFirstNegative to be moved + */ CLTopKV2FindFirstNegative(CLTopKV2FindFirstNegative &&) = default; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2FindFirstNegative to be moved + * @return Reference of this instance + */ CLTopKV2FindFirstNegative &operator=(CLTopKV2FindFirstNegative &&) = default; + /** + * @brief Initialise kernel with params + * @param[out] first_negative_idx_buf Buffer of the first negative index + * @param[in] n Number times to find + * return N/A + */ void configure(cl::Buffer *first_negative_idx_buf, int n); + /** + * @brief Set output buffer + * @param[out] out_key_buf Buffer of output key + * return N/A + */ void setOutputBuffer(cl::Buffer *out_key_buf) { _out_key_buf = out_key_buf; } - // Inherited methods overridden: + /* + * @brief Run CLTopKV2FindFirstNegative op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ void run(const Window &window, cl::CommandQueue &queue) override; private: cl::Buffer *_out_key_buf; }; +/** + * @brief Class to define CLTopKV2ReorderNegatives + */ class CLTopKV2ReorderNegatives : public ICLKernel { public: - /** Constructor */ + /** + * @brief Constructor + */ CLTopKV2ReorderNegatives(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLTopKV2ReorderNegatives to be copied + */ CLTopKV2ReorderNegatives(const CLTopKV2ReorderNegatives &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2ReorderNegatives to be copied + * @return Reference of this instance + */ CLTopKV2ReorderNegatives &operator=(const CLTopKV2ReorderNegatives &) = delete; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2ReorderNegatives to be moved + */ CLTopKV2ReorderNegatives(CLTopKV2ReorderNegatives &&) = default; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2ReorderNegatives to be moved + * @return Reference of this instance + */ CLTopKV2ReorderNegatives &operator=(CLTopKV2ReorderNegatives &&) = default; + /** + * @brief Initialise kernel with params + * @param[out] first_negative_idx_buf Buffer of the first negative index + * @param[in] n Number times to find + * return N/A + */ void configure(cl::Buffer *first_negative_idx_buf, int n); + /** + * @brief Set buffers + * @param[in] in_key_buf Buffer of input key + * @param[out] out_key_buf Buffer of output key + * @param[in] in_ind_buf Buffer of input index + * @param[out] out_ind_buf Buffer of output index + * return N/A + */ void setBuffers(cl::Buffer *in_key_buf, cl::Buffer *out_key_buf, cl::Buffer *in_ind_buf, cl::Buffer *out_ind_buf) { @@ -258,7 +567,12 @@ public: _out_ind_buf = out_ind_buf; } - // Inherited methods overridden: + /* + * @brief Run CLTopKV2ReorderNegatives op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A 
+ */ void run(const Window &window, cl::CommandQueue &queue) override; private: @@ -268,25 +582,63 @@ private: cl::Buffer *_out_ind_buf; }; +/** + * @brief Class to define CLTopKV2Store + */ class CLTopKV2Store : public ICLKernel { public: - /** Constructor */ + /** + * @brief Constructor + */ CLTopKV2Store(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2Store to be copied + */ CLTopKV2Store(const CLTopKV2Store &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2Store to be copied + * @return Reference of this instance + */ CLTopKV2Store &operator=(const CLTopKV2Store &) = delete; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Store to be moved + */ CLTopKV2Store(CLTopKV2Store &&) = default; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Store to be moved + * @return Reference of this instance + */ CLTopKV2Store &operator=(CLTopKV2Store &&) = default; + /** + * @brief Initialise kernel with params + * @param[out] values Values tensor to store + * @param[out] indices Indices tensor to be used for store + * @param[in] k K of the top k predictions + * @param[in] n Number times to store + * return N/A + */ void configure(ICLTensor *values, ICLTensor *indices, int k, int n); + /** + * @brief Set buffers + * @param[out] out_key_buf Buffer of output key + * @param[out] out_ind_buf Buffer of output index + * return N/A + 
*/ void setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf); - // Inherited methods overridden: + /* + * @brief Run CLTopKV2Store op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ void run(const Window &window, cl::CommandQueue &queue) override; private: diff --git a/libs/ARMComputeEx/arm_compute/core/NEON/kernels/NENormalizationLayerExKernel.h b/libs/ARMComputeEx/arm_compute/core/NEON/kernels/NENormalizationLayerExKernel.h new file mode 100644 index 000000000..f7bf72985 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/NEON/kernels/NENormalizationLayerExKernel.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_NENORMALIZATIONLAYEREXKERNEL_H__ +#define __ARM_COMPUTE_NENORMALIZATIONLAYEREXKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the normalization layer kernel. 
+ */ +class NENormalizationLayerExKernel : public INEKernel +{ +public: + const char *name() const override { return "NENormalizationLayerKernel"; } + /** Default constructor */ + NENormalizationLayerExKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NENormalizationLayerExKernel(const NENormalizationLayerExKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NENormalizationLayerExKernel &operator=(const NENormalizationLayerExKernel &) = delete; + /** Default Move Constructor. */ + NENormalizationLayerExKernel(NENormalizationLayerExKernel &&) = default; + /** Default move assignment operator */ + NENormalizationLayerExKernel &operator=(NENormalizationLayerExKernel &&) = default; + /** Default destructor */ + ~NENormalizationLayerExKernel() = default; + /** Set the input and output tensors. + * + * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions + * [width, height, IFM], + * and an optional 4th dimension for batch of inputs. Data types + * supported: FP16/F32. + * @param[in] input_squared Source with each element has been squared. 3 lower dims represent a + * single input with dimensions [width, height, IFM], + * Data type supported: same as @p input + * @param[out] output Destination tensor. Output will have the same number of dimensions as + * input. Data type supported: same as @p input + * @param[in] norm_info Normalization layer information like the normalization type, + * normalization size and other parameters. + */ + void configure(const ITensor *input, const ITensor *input_squared, ITensor *output, + NormalizationLayerInfo norm_info); + /** Static function to check if given info will lead to a valid configuration of @ref + * NENormalizationLayerKernel + * + * @param[in] input Source tensor. 
3 lower dims represent a single input with dimensions + * [width, height, IFM], + * and an optional 4th dimension for batch of inputs. Data types + * supported: FP16/F32. + * @param[in] input_squared Source with each element has been squared. 3 lower dims represent a + * single input with dimensions [width, height, IFM], + * Data type supported: same as @p input + * @param[in] output Destination tensor. Output will have the same number of dimensions as + * input. Data type supported: same as @p input + * @param[in] norm_info Normalization layer information like the normalization type, + * normalization size and other parameters. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *input_squared, + const ITensorInfo *output, NormalizationLayerInfo norm_info); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + BorderSize border_size() const override; + +private: + /** Function to perform normalization depending on the given template + * dimension. The second template parameter specifies whether the + * normalization has to be 1D or 2D. + * + * @note Only supported normalizations are: + * - 1D over X or Z + * - 2D over X and Y + * + * @param[in] window Region on which to execute the kernel. + */ + template <DataType dt, unsigned int dim, bool do_2D_norm> + void normalize_float(const Window &window); + + /** Common signature for all the specialised normalization functions + * + * @param[in] window Region on which to execute the kernel. 
+ */ + using NormalizationFunctionEx = void (NENormalizationLayerExKernel::*)(const Window &window); + +private: + NormalizationFunctionEx _func; + const ITensor *_input; + const ITensor *_input_squared; + ITensor *_output; + NormalizationLayerInfo _norm_info; + BorderSize _border_size; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NENORMALIZATIONLAYEREXKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/TypesEx.h b/libs/ARMComputeEx/arm_compute/core/TypesEx.h new file mode 100644 index 000000000..8381f1cc6 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/TypesEx.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __ARM_COMPUTE_TYPESEX_H__ +#define __ARM_COMPUTE_TYPESEX_H__ + +#include <cmath> +#include <cstddef> +#include <cstdint> +#include <string> +#include <utility> + +namespace arm_compute +{ + +/** Available ArgIndex operations **/ +enum class ArgOperation +{ + MAX, + MIN, +}; + +/** Available reduce operations */ +enum class ReduceOperation +{ + MAX, /**< Max */ + MEAN, /**< Mean */ + SUM, /**< Sum */ + MIN, /**< Min */ +}; + +/** Available binary logical operations */ +enum class BinaryLogicalOperation +{ + AND, /**< AND */ + OR, /**< OR */ +}; + +enum class ComparisonOperation +{ + EQUAL, /**< EQUAL */ + NOT_EQUAL, /**< NOT_EQUAL */ +}; + +/** Activation Layer Information class */ +class ActivationLayerInfoEx +{ +public: + /** Available activation functions */ + enum class ActivationFunction + { + RSQRT /**< Inverse Square root ( \f$ f(x) = \rsqrt{x} \f$ )*/ + }; + + ActivationLayerInfoEx() = default; + /** Default Constructor + * + * @param[in] f The activation function to use. + * @param[in] a (Optional) The alpha parameter used by some activation functions + * (@ref ActivationFunction::BOUNDED_RELU, @ref ActivationFunction::LU_BOUNDED_RELU, + * @ref ActivationFunction::LINEAR, @ref ActivationFunction::TANH). + * @param[in] b (Optional) The beta parameter used by some activation functions (@ref + * ActivationFunction::LINEAR, @ref ActivationFunction::LU_BOUNDED_RELU, @ref + * ActivationFunction::TANH). 
+ */ + ActivationLayerInfoEx(ActivationFunction f, float a = 0.0f, float b = 0.0f) + : _act(f), _a(a), _b(b), _enabled(true) + { + } + /** Get the type of activation function */ + ActivationFunction activation() const { return _act; } + /** Get the alpha value */ + float a() const { return _a; } + /** Get the beta value */ + float b() const { return _b; } + /** Check if initialised */ + bool enabled() const { return _enabled; } + +private: + ActivationFunction _act = {ActivationLayerInfoEx::ActivationFunction::RSQRT}; + float _a = {}; + float _b = {}; + bool _enabled = {false}; +}; + +} // namespace arm_compute +#endif /* __ARM_COMPUTE_TYPESEX_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/UtilsEx.h b/libs/ARMComputeEx/arm_compute/core/UtilsEx.h new file mode 100644 index 000000000..8dd68a0c3 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/UtilsEx.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_UTILSEX_H__ +#define __ARM_COMPUTE_UTILSEX_H__ + +#include "arm_compute/core/TypesEx.h" + +#include <cstdint> +#include <cstdlib> +#include <sstream> +#include <string> + +namespace arm_compute +{ +/** Translates a given activation function to a string. + * + * @param[in] act @ref ActivationLayerInfo::ActivationFunction to be translated to string. 
+ * + * @return The string describing the activation function. + */ +const std::string &string_from_activation_func_ex(ActivationLayerInfoEx::ActivationFunction act); +} +#endif /*__ARM_COMPUTE_UTILSEX_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLActivationLayerEx.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLActivationLayerEx.h new file mode 100644 index 000000000..7e578550f --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLActivationLayerEx.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLACTIVATIONLAYEREX_H__ +#define __ARM_COMPUTE_CLACTIVATIONLAYEREX_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLActivationLayerExKernel + * + * @note The function simulates an activation layer with the specified activation function. + */ +class CLActivationLayerEx : public ICLSimpleFunction +{ +public: + /** Set the input and output tensor. + * + * @note If the output tensor is a nullptr or is equal to the input, the activation function will + * be performed in-place + * + * @param[in, out] input Source tensor. 
In case of @p output tensor = nullptr, this tensor will + * store the result + * of the activation function. Data types supported: + * QASYMM8/F16/F32. + * @param[out] output Destination tensor. Data type supported: same as @p input + * @param[in] act_info Activation layer parameters. + */ + void configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfoEx act_info); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLActivationLayer + * + * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor + * will store the result + * of the activation function. Data types supported: QASYMM8/F16/F32. + * @param[in] output Destination tensor info. Data type supported: same as @p input + * @param[in] act_info Activation layer information. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ActivationLayerInfoEx &act_info); +}; +} +#endif /* __ARM_COMPUTE_CLACTIVATIONLAYEREX_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMax.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMax.h new file mode 100644 index 000000000..8044c58af --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMax.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CLArgMinMax.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLArgMinMax class + */ + +#ifndef __ARM_COMPUTE_CLARG_MIN_MAX_H__ +#define __ARM_COMPUTE_CLARG_MIN_MAX_H__ + +#include "arm_compute/core/CL/kernels/CLArgMinMaxKernel.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to execute CLArgMinMax operation + */ +class CLArgMinMax : public IFunction +{ +public: + /** + * @brief Construct a new CLArgMinMax object + */ + CLArgMinMax(); + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + */ + CLArgMinMax(const CLArgMinMax &) = delete; + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + */ + CLArgMinMax &operator=(const CLArgMinMax &) = delete; + + /** + * @brief Construct a new CLArgMinMax object by using copy constructor + * @param[in] CLArgMinMax object to move + */ + CLArgMinMax(CLArgMinMax &&) = default; + + /** + * @brief Assign a CLArgMinMax object. + * @param[in] CLArgMinMax object to assign. This object will be moved. + */ + CLArgMinMax &operator=(CLArgMinMax &&) = default; + + /** + * @brief Initialise the kernel's inputs and outputs. + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S32/F32. + * @param[out] output The result of argminmaxMax operation. Data types supported: same as @p + * input. + * @param[in] axis Axis to argminmax. It must be sorted and no duplicates. + * @param[in] is_min True for ArgMin operation. + * @param[in] is_max Ture for ArgMax operation. 
+ * @return N/A
+ */
+ void configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> argminmax_axis,
+ ArgOperation op);
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S32/F32.
+ * @param[in] axis Axis to argminmax
+ * @param[out] output The result of argminmaxMax operation. Data types supported: same as @p
+ * input.
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const std::vector<uint32_t> &argminmax_axis,
+ const ITensorInfo *output, ArgOperation op);
+
+ /**
+ * @brief Run the kernels contained in the function
+ * This operation works on CPU or GPU depending on the value of argminmax_MAX_RUN_ON_CPU macro
+ * in CLArgMinMax.cpp.
+ * If argminmax_MAX_RUN_ON_CPU == 1, CPU runs this operation.
+ * Otherwise GPU runs this operation.
+ * @return N/A
+ */
+ void run() override;
+
+private:
+ ICLTensor *_input;
+ ICLTensor *_output;
+ std::vector<uint32_t> _argminmax_axis;
+ ArgOperation _arg_op;
+
+ std::unique_ptr<CLTensor[]> _interm_tensors{nullptr};
+ std::unique_ptr<CLArgMinMaxKernel[]> _argminmax_kernels{nullptr};
+ size_t _num_of_kernels;
+};
+}
+#endif /*__ARM_COMPUTE_CLARG_MIN_MAX_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArithmeticSubtractionEx.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArithmeticSubtractionEx.h
new file mode 100644
index 000000000..34e6c6334
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArithmeticSubtractionEx.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLARITHMETICSUBTRACTIONEX_H__ +#define __ARM_COMPUTE_CLARITHMETICSUBTRACTIONEX_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLArithmeticSubtractionExKernel + * + * @note The tensor data type for the inputs must be U8/S16/F16/F32. + * @note The function performs an arithmetic subtraction between two tensors. + */ +class CLArithmeticSubtractionEx : public ICLSimpleFunction +{ +public: + /** Initialise the kernel's inputs, output and convertion policy. + * + * @param[in, out] input1 An input tensor. Data types supported: U8/S16/F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified + * inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] input2 An input tensor. Data types supported: same as @p input1. + * The input tensor is [in, out] because its TensorInfo might be modified + * inside the kernel in case of broadcasting of dimension 0. + * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), + * S16/F16/F32. + * @param[in] policy Policy to use to handle overflow. + */ + void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLArithmeticSubtractionEx + * + * @param[in] input1 First tensor input info. Data types supported: U8/S16/F16/F32. + * @param[in] input2 Second tensor input info. 
Data types supported: U8/S16/F16/F32. + * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), + * S16/F16/F32. + * @param[in] policy Policy to use to handle overflow. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output, ConvertPolicy policy); +}; +} +#endif /* __ARM_COMPUTE_CLARITHMETICSUBTRACTIONEX_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h new file mode 100644 index 000000000..d16a0762d --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ +#define __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLBatchToSpaceNDKernel + * + * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. + * @note The function converts the input tensor to the tensor of the output tensor's type. + */ +class CLBatchToSpaceND : public ICLSimpleFunction +{ +public: + /** Initialise the kernel's input and output. 
+ * + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[in] block_size A pointer to an array of integer values specifying block sizes + * for spatial dimension. + */ + void configure(ICLTensor *input, ICLTensor *output, const int32_t *block_size); +}; + +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h new file mode 100644 index 000000000..061e34f26 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLBINARYLOGICALOP_H__ +#define __ARM_COMPUTE_CLBINARYLOGICALOP_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ICLTensor; + +class CLBinaryLogicalOp : public ICLSimpleFunction +{ +public: + /** Initialise the function's source and destination. + * + * @param[in] input1 Source tensor1. Data types supported: U8, QASYMM8. + * @param[in] input2 Source tensor2. Data types supported: U8 QASYMM8. 
+ * @param[out] output Output tensor. Data types supported: U8, QASYMM8. + */ + void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, + BinaryLogicalOperation op); +}; + +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLBINARYLOGICALOP_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h index 63050067d..56b8408e2 100644 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h @@ -14,30 +14,35 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + +/** + * @file CLCast.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLCast class + */ + #ifndef __ARM_COMPUTE_CLCAST_H__ #define __ARM_COMPUTE_CLCAST_H__ -#include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/ICLSimpleFunction.h" namespace arm_compute { class ICLTensor; -/** Basic function to run @ref CLCastKernel - * - * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. - * @note The function converts the input tensor to the tensor of the output tensor's type. +/** + * @brief Class to run @ref CLCastKernel. + * This converts the input tensor to the tensor of the output tensor's type. */ class CLCast : public ICLSimpleFunction { public: - /** Initialise the kernel's input and output. - * - * @param[in, out] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * The input tensor is [in, out] because its TensorInfo might be modified - * inside the kernel. - * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + /** + * @brief Initialise the kernel's input and output + * @param[in, out] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * The input tensor is [in, out] because its TensorInfo might be + * modified inside the kernel. 
+ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. */ void configure(ICLTensor *input, ICLTensor *output); }; diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLComparisonOp.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLComparisonOp.h new file mode 100644 index 000000000..1b0d70e7f --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLComparisonOp.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLCOMPARISON_OP_H__ +#define __ARM_COMPUTE_CLCOMPARISON_OP_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ICLTensor; + +class CLComparisonOp : public ICLSimpleFunction +{ +public: + /** Initialise the function's source and destination. + * + * @param[in] input1 Source tensor1. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] input2 Source tensor2. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[out] output Output tensor. Data types supported: Same as @p input. 
+ */ + void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, + const ComparisonOperation &op); +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLCOMPARISON_OP_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h new file mode 100644 index 000000000..d78a6ada4 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLDEPTHTOSPACE_H__ +#define __ARM_COMPUTE_CLDEPTHTOSPACE_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLDepthToSpaceKernel + * + * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. + * @note The function converts the input tensor to the tensor of the output tensor's type. + */ +class CLDepthToSpace : public ICLSimpleFunction +{ +public: + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. 
+ * @param[in] block_size Block size; integer only
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const int32_t block_size);
+};
+} // namespace arm_compute
+
+#endif /* __ARM_COMPUTE_CLDEPTHTOSPACE_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h
new file mode 100644
index 000000000..257772a89
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLEmbeddingLookup.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLEmbeddingLookup class
+ */
+
+#ifndef __ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__
+#define __ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to perform EmbeddingLookup operation
+ */
+class CLEmbeddingLookup : public ICLSimpleFunction
+{
+public:
+ /**
+ * @brief Set the input and output tensors.
+ * @param[in] input Source tensor.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[in] lookups Lookups 1D tensor that values are indices into the first dimension of + * input. + * @return N/A + */ + void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups); +}; +} +#endif /*__ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLExp.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLExp.h new file mode 100644 index 000000000..2d0fc23a4 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLExp.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLEXP_H__ +#define __ARM_COMPUTE_CLEXP_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLExpKernel */ +class CLExp : public ICLSimpleFunction +{ +public: + /** Set the source, destination of the kernel + * + * @param[in] input Source tensor. Data type supported: F32. + * @param[out] output Destination tensor. Data type supported: F32. 
+ */
+ void configure(const ICLTensor *input, ICLTensor *output);
+};
+}
+#endif /* __ARM_COMPUTE_CLEXP_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h
index 3ae7afe14..f7fd3cda1 100644
--- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h
@@ -14,32 +14,43 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
+
+/**
+ * @file CLGather.h
+ * @brief This file contains CLGather class
+ * @ingroup COM_AI_RUNTIME
+ */
+
 #ifndef __ARM_COMPUTE_CLGATHER_H__
 #define __ARM_COMPUTE_CLGATHER_H__
 
-#include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/ICLSimpleFunction.h"
 
 namespace arm_compute
 {
 class ICLTensor;
 
-/** Basic function to run @ref CLGatherKernel. */
+/**
+ * @brief Class to run @ref CLGatherKernel.
+ */
 class CLGather : public ICLSimpleFunction
 {
 public:
- /** Initialise the kernel's inputs, output and convertion policy.
- *
- * @param[in] input1 An input tensor. Data types supported: U8/S32/F32.
- * @param[in] input2 An indexes tensor. Data types supported: S32.
- * @param[out] output The output tensor, Data types supported: same as @p input1.
- */
+ /**
+ * @brief Initialise the kernel's inputs, output and conversion policy.
+ * @param[in] input1 An input tensor. Data types supported: U8/S32/F32.
+ * @param[in] input2 An indexes tensor. Data types supported: S32.
+ * @param[out] output The output tensor, Data types supported: same as @p input1.
+ * @return N/A
+ */
 void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGather
- *
- * @param[in] input1 An input tensor. Data types supported: U8/S32/F32.
- * @param[in] input2 An indexes tensor. Data types supported: S32.
- * @param[out] output The output tensor, Data types supported: same as @p input1. + + /** + * @brief Static function to check if given info will lead to a valid configuration + * of @ref CLGather + * @param[in] input1 An input tensor. Data types supported: U8/S32/F32. + * @param[in] input2 An indexes tensor. Data types supported: S32. + * @param[out] output The output tensor, Data types supported: same as @p input1. * @return a status */ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h new file mode 100644 index 000000000..65aa6cbd5 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+/**
+ * @file CLHashtableLookup.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLHashtableLookup class
+ */
+
+#ifndef __ARM_COMPUTE_CLHASHTABLELOOKUP_H__
+#define __ARM_COMPUTE_CLHASHTABLELOOKUP_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to perform HashtableLookup operation
+ */
+class CLHashtableLookup : public ICLSimpleFunction
+{
+public:
+ /**
+ * @brief Set the input and output tensors.
+ * @param[in] lookups Lookups 1D tensor that values are indices into the first dimension of
+ * input.
+ * @param[in] keys Keys 1D tensor. keys and input pair represent a map.
+ * Data types supported: S32
+ * @param[in] input Source tensor.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[out] hits Hits 1D tensor. A boolean tensor that indicates whether the lookup hits
+ * (True) or not (False). Data types supported: U8/QASYMM8
+ * @return N/A
+ */
+ void configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *input,
+ ICLTensor *output, ICLTensor *hits);
+};
+}
+#endif /*__ARM_COMPUTE_CLHASHTABLELOOKUP_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h
new file mode 100644
index 000000000..198a0fd4e
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLNEG_H__ +#define __ARM_COMPUTE_CLNEG_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +class CLNeg : public ICLSimpleFunction +{ +public: + /** Initialise the function's source and destination. + * + * @param[in] input Source tensor. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[out] output Output tensor. Data types supported: Same as @p input. + * + */ + void configure(ICLTensor *input, ICLTensor *output); +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLNEG_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNormalizationLayerEx.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNormalizationLayerEx.h new file mode 100644 index 000000000..4077245d5 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNormalizationLayerEx.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLNORMALIZATIONLAYEREX_H__ +#define __ARM_COMPUTE_CLNORMALIZATIONLAYEREX_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" +#include "arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to compute a normalization layer. This function calls the following CL kernels: + * + * -# @ref CLFillBorderKernel + * -# @ref CLNormalizationLayerKernelEx + * + */ +class CLNormalizationLayerEx : public IFunction +{ +public: + /** Default constructor */ + CLNormalizationLayerEx(); + /** Set the input and output tensors. + * + * @param[in, out] input Source tensor. 3 lower dims represent a single input with dimensions + * [width, height, IFM], + * and an optional 4th dimension for batch of inputs. Data types + * supported: F16/F32 (Written to by the border handler) + * @param[out] output Destination tensor. Dimensions, data type and number of channels must + * match the input ones. + * @param[in] norm_info Normalization layer information like the normalization type, + * normalization size and other parameters. + */ + void configure(ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLNormalizationLayer + * + * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions + * [width, height, IFM], + * and an optional 4th dimension for batch of inputs. Data types supported: + * F16/F32 + * @param[in] output Destination tensor. Dimensions, data type and number of channels must + * match the input ones. + * @param[in] norm_info Normalization layer information like the normalization type, normalization + * size and other parameters. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const NormalizationLayerInfo &norm_info); + + // Inherited methods overridden: + void run() override; + +private: + CLNormalizationLayerExKernel _norm_kernel; /**< Normalization layer kernel to run */ + CLFillBorderKernel _border_handler; /**< Kernel to handle borders */ +}; +} +#endif /* __ARM_COMPUTE_CLNORMALIZATIONLAYEREX_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h new file mode 100644 index 000000000..622a61b5e --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLPRELU_H__ +#define __ARM_COMPUTE_CLPRELU_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +class CLPReLU : public ICLSimpleFunction +{ +public: + /** Initialise the function's source and destination. + * + * @param[in] input. Data types supported: + * QASYMM8/F16/F32. + * @param[in] alpha. Data types supported: + * QASYMM8/F16/F32. + * @param[out] output Output tensor. Data types supported: Same as @p input. 
+ */ + void configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output); +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLPRELU_H__*/ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h new file mode 100644 index 000000000..d6ea486d1 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h @@ -0,0 +1,47 @@ +/* +* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +* Copyright (c) 2016-2018 ARM Limited. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ +#ifndef __ARM_COMPUTE_CLPADLAYEREX_H__ +#define __ARM_COMPUTE_CLPADLAYEREX_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLPadLayerKernel + * + * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. + * @note The function converts the input tensor to the tensor of the output tensor's type. + */ +class CLPadLayerEx : public ICLSimpleFunction +{ +public: + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data types supported: + * U8/QASYMM8/S16/S32/F16/F32. + * @param[out] output Output tensor. Data types supported: + * U8/QASYMM8/S16/S32/F16/F32. + * @param[in] pad_size Tensor for Padding values in NHWC format shape [n, 2], + * where n is the rank of tensor . 
Data types supported: S32 + */ + void configure(ICLTensor *input, ICLTensor *output, ICLTensor *pad_size); +}; + +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLPADLAYEREX_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPermuteEx.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPermuteEx.h new file mode 100644 index 000000000..9a0cc213c --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPermuteEx.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLPERMUTEEX_H__ +#define __ARM_COMPUTE_CLPERMUTEEX_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to execute an @ref CLPermuteKernel. */ +class CLPermuteEx : public ICLSimpleFunction +{ +public: + /** Set the input and output tensors. + * + * @param[in] input The input tensor to permute. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] output The output tensor. Data types supported: Same as @p input + * @param[in] perm Permutation vector + */ + void configure(const ICLTensor *input, ICLTensor *output, const PermutationVector &perm); + /** Static function to check if given info will lead to a valid configuration of @ref CLPermute. + * + * @param[in] input First tensor input info. 
Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] output Output tensor info. Data types supported: same as @p input. + * @param[in] perm Permutation vector + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const PermutationVector &perm); +}; +} +#endif /*__ARM_COMPUTE_CLPERMUTEEX_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h index c1383e21f..b142d3a2e 100644 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h @@ -14,53 +14,61 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + +/** + * @file CLPixelWiseDivision.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLPixelWiseDivision class + */ #ifndef __ARM_COMPUTE_CLPIXELWISEDIVISION_H__ #define __ARM_COMPUTE_CLPIXELWISEDIVISION_H__ -#include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/ICLSimpleFunction.h" namespace arm_compute { class ICLTensor; -/** Basic function to run @ref CLPixelWiseDivisionKernel. */ +/** + * @brief Class to run @ref CLPixelWiseDivisionKernel. + */ class CLPixelWiseDivision : public ICLSimpleFunction { public: - /** Initialise the kernel's inputs, output and convertion policy. - * - * @param[in, out] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32. + /** + * @brief Initialise the kernel's inputs, output and convertion policy. + * @param[in, out] input1 An input tensor. Data types supported: U8/S16/F16/F32 * The input tensor is [in, out] because its TensorInfo might be * modified inside the kernel in case of broadcasting of dimension 0. * @param[in, out] input2 An input tensor. Data types supported: same as @p input1. 
* The input tensor is [in, out] because its TensorInfo might be * modified inside the kernel in case of broadcasting of dimension 0. * @param[out] output The output tensor, Data types supported: same as @p input1. - * Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). + * Note: U8 requires both inputs to be U8. * @param[in] scale Scale to apply after multiplication. * Scale must be positive and its value must be either 1/255 or - * 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1. + * 1/2^n where n is between 0 and 15. * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest * even. + * @return N/A */ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale = 1.f, ConvertPolicy overflow_policy = ConvertPolicy::WRAP, RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO); - /** Static function to check if given info will lead to a valid configuration of @ref + + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref * CLPixelWiseDivision - * - * @param[in] input1 An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32. + * @param[in] input1 An input tensor info. Data types supported: U8/S16/F16/F32 * @param[in] input2 An input tensor info. Data types supported: same as @p input1. * @param[in] output The output tensor info, Data types supported: same as @p input1. - * Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). + * Note: U8 requires both inputs to be U8. * @param[in] scale Scale to apply after multiplication. * Scale must be positive and its value must be either 1/255 or 1/2^n - * where n is between 0 and 15. For QS8 and QS16 scale must be 1. + * where n is between 0 and 15. * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate * @param[in] rounding_policy Rounding policy. 
Supported rounding modes: to zero, to nearest even. - * * @return a status */ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h deleted file mode 100644 index 14b473f33..000000000 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __ARM_COMPUTE_CLREDUCE_MAX_H__ -#define __ARM_COMPUTE_CLREDUCE_MAX_H__ - -#include "arm_compute/runtime/CL/CLArray.h" -#include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Basic function to execute TopK operation. 
This function calls the following OpenCL kernels: - * - * -# @ref CLTopKV2Kernel - */ -class CLReduceMax : public IFunction -{ -public: - /** Constructor */ - CLReduceMax(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLReduceMax(const CLReduceMax &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLReduceMax &operator=(const CLReduceMax &) = delete; - /** Allow instances of this class to be moved */ - CLReduceMax(CLReduceMax &&) = default; - /** Allow instances of this class to be moved */ - CLReduceMax &operator=(CLReduceMax &&) = default; - /** Initialise the kernel's inputs and outputs. - * - * @note When locations of min and max occurrences are requested, the reported number of locations - * is limited to the given array size. - * - * @param[in] input Input image. Data types supported: F32 - * @param[in] axis Axis to reduce. Data type supported: S32 - * @param[out] output indices related to top k values. Data types supported: F32. - */ - void configure(ICLTensor *input, int32_t axis, ICLTensor *output); - /** Static function to check if given info will lead to a valid configuration of @ref - * CLPixelWiseDivision - * - * @param[in] input Input image. Data types supported: F32 - * @param[in] axis Axis to reduce. Data type supported: S32 - * @param[out] output indices related to top k values. Data types supported: F32. 
* - * - * @return a status - */ - static Status validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output); - - // Inherited methods overridden: - void run() override; - -private: - void run_on_cpu(); - - int32_t _axis; - - ICLTensor *_input; - ICLTensor *_output; - - std::unique_ptr<ICLKernel> _kernel; -}; -} -#endif /*__ARM_COMPUTE_CLREDUCE_MAX_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h new file mode 100644 index 000000000..e1a6f6ab4 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file CLReduceOperation.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLReduceOperation class + */ + +#ifndef __ARM_COMPUTE_CLREDUCEOPERATION_H__ +#define __ARM_COMPUTE_CLREDUCEOPERATION_H__ + +#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h" +#include "arm_compute/core/TypesEx.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/IFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to perform ReduceOperation + */ +class CLReduceOperation : public IFunction +{ +public: + /** + * @brief Construct a new ReduceOperation object + */ + CLReduceOperation(); + + /** + * @brief Set the input and output tensors. + * @param[in] input Source tensor. Data types supported: U8/S32/F32 + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p + * input. + * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates. + * @param[in] op Reduce operation to perform. + * @return N/A + */ + void configure(ICLTensor *input, ICLTensor *output, const std::set<uint32_t> &axis, + ReduceOperation op); + + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref + * CLReduceOperation. + * @param[in] input Source tensor info. Data types supported: U8/S32/F32 + * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p + * input. + * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates. + * @param[in] op Reduce operation to perform. 
+ * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const std::set<uint32_t> &axis, const ReduceOperation &op); + + /** + * @brief Run the OpenCL kernel for this operation + * @return N/A + */ + void run() override; + +private: + ICLTensor *_input; + ICLTensor *_output; + std::set<uint32_t> _axis; + + std::unique_ptr<CLTensor[]> _interm_tensors{nullptr}; + std::unique_ptr<CLReduceOperationKernel[]> _reduce_kernels{nullptr}; +}; +} +#endif /*__ARM_COMPUTE_CLREDUCEOPERATION_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReductionMean.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReductionMean.h deleted file mode 100644 index 2081518c1..000000000 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReductionMean.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __ARM_COMPUTE_CLREDUCTIONMEAN_H__ -#define __ARM_COMPUTE_CLREDUCTIONMEAN_H__ - -#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" -#include "arm_compute/core/CL/kernels/CLReductionMeanKernel.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/runtime/CL/CLTensor.h" -#include "arm_compute/runtime/IFunction.h" - -#include <cstdint> -#include <memory> -#include <vector> - -namespace arm_compute -{ -class ICLTensor; - -/** Perform reduction operation. - */ -class CLReductionMean : public IFunction -{ -public: - /** Default Constructor. - */ - CLReductionMean(); - - /** Set the input and output tensors. - * - * @param[in] input Source tensor. Data types supported: F32. Data layouts supported: NCHW. - * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input. - * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1 - */ - void configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis); - - /** Static function to check if given info will lead to a valid configuration of @ref - * CLReductionMean. - * - * @param[in] input Source tensor info. Data types supported: F32. Data layouts supported: NCHW. - * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p - * input. - * @param[in] axis Axis along which to reduce. 
Supported reduction axis : 0,1 - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - std::vector<uint32_t> axis); - - // Inherited methods overridden: - void run() override; - -private: - CLReductionMeanKernel _reduction_mean_kernel; - CLFillBorderKernel _fill_border_kernel; -}; -} -#endif /*__ARM_COMPUTE_CLREDUCTIONMEAN_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h new file mode 100644 index 000000000..7e2df8986 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_H__ +#define __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLSpaceToBatchNDKernel + * + * @note The tensor data type for the inputs must be U8/QASYMM8/S16/F16/S32/F32. + * @note The function divides "spatial" dimensions of the input into a grid of blocks of shape + * block_shape, and interleaves these blocks with the "batch" dimension such that in the output. 
+ */ +class CLSpaceToBatchND : public ICLSimpleFunction +{ +public: + /** Initialise the kernel's input and output. + * + * @note The data layout of input and output must be the same. + * @note The number of dimensions of input and output must be 4, and `spatial` dimensions + * are height and width. + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32. + * Data layout supported: NCHW/NHWC + * @param[in] block_size Tensor of integer values specifying block sizes for spatial + * dimension. + * Data types supported: S32 + * @param[in] padding_size Tensor of integer values specifying padding sizes for spatial + * dimension. + * Data types supported: S32 + * @param[out] output Output tensor. Data types supported: same as @p input. + * Data layout supported: NCHW/NHWC + */ + void configure(const ICLTensor *input, const ICLTensor *block_size, const ICLTensor *padding_size, + ICLTensor *output); +}; + +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h new file mode 100644 index 000000000..17f762092 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLSPACETODEPTH_H__
+#define __ARM_COMPUTE_CLSPACETODEPTH_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLSpaceToDepthKernel
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32.
+ * @note The function converts the input tensor to the tensor of the output tensor's type.
+ */
+class CLSpaceToDepth : public ICLSimpleFunction
+{
+public:
+  /** Initialise the kernel's input and output.
+   *
+   * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+   * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+   * @param[in] block_size Block size; only integer values are supported
+   */
+  void configure(ICLTensor *input, ICLTensor *output, const int32_t block_size);
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLSPACETODEPTH_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSquaredDifference.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSquaredDifference.h
new file mode 100644
index 000000000..3610ba71c
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSquaredDifference.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLSQUARED_DIFFERENCE_H__ +#define __ARM_COMPUTE_CLSQUARED_DIFFERENCE_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +class CLSquaredDifference : public ICLSimpleFunction +{ +public: + /** Initialise the function's source and destination. + * + * @param[in] input1 Source tensor1. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] input2 Source tensor2. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[out] output Output tensor. Data types supported: Same as @p input. + */ + void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output); +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLSQUARED_DIFFERENCE_H__*/ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSlice.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSlice.h deleted file mode 100644 index f223a79be..000000000 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSlice.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#ifndef __ARM_COMPUTE_CLSTRIDEDSLICE_H__ -#define __ARM_COMPUTE_CLSTRIDEDSLICE_H__ - -#include "arm_compute/runtime/IFunction.h" -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Basic function to run @ref CLStridedSliceKernel */ -class CLStridedSlice : public ICLSimpleFunction -{ -public: - /** Initialise the kernel's inputs and outputs - * - * @param[in] input First tensor input. Data type supported: - * U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32 - * @param[out] output Output tensor. Data type supported: Same as @p input - */ - void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData, - ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask, - int32_t shrinkAxisMask); -}; - -class CLStridedSliceCPU : public IFunction -{ -public: - /** Initialise inputs and outputs - * - * @param[in] input First tensor input. - * @param[out] output Output tensor. - */ - void configure(ICLTensor *input, ICLTensor *output, ICLTensor *beginData, ICLTensor *endData, - ICLTensor *stridesData, int32_t beginMask, int32_t endMask, - int32_t shrinkAxisMask); - - void run() override; - -private: - void run_on_cpu(); - - ICLTensor *_input; - ICLTensor *_output; - ICLTensor *_beginData; - ICLTensor *_endData; - ICLTensor *_stridesData; - int32_t _beginMask; - int32_t _endMask; - int32_t _shrinkAxisMask; -}; -} -#endif /*__ARM_COMPUTE_CLSTRIDEDSLICE_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h new file mode 100644 index 000000000..6b26a85c8 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CLStridedSlice.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLStridedSlice and arm_compute::CLStridedSliceCPU class + */ + +#ifndef __ARM_COMPUTE_CLSTRIDEDSLICEEX_H__ +#define __ARM_COMPUTE_CLSTRIDEDSLICEEX_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to run @ref CLStridedSliceKernel + */ +class CLStridedSliceEx : public ICLSimpleFunction +{ +public: + /** + * @brief Initialise the kernel's inputs and outputs + * @param[in] input Tensor input. Data type supported: + * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Output tensor. 
Data type supported: Same as @p input + * @param[in] beginData 'begin' vector of strided slice operation + * @param[in] endData 'end' vector of strided slice operation + * @param[in] stridesData 'strides' vector of strided slice operation + * @param[in] beginMask If the ith bit is set, begin[i] is ignored + * @param[in] endMask If the ith bit is set, end[i] is ignored + * @param[in] shrinkAxisMask If the ith bit is set, the ith specification shrinks the + * dimensionality by 1, taking on the value at index begin[i] + * @return N/A + */ + void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData, + ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask, + int32_t shrinkAxisMask); +}; +} +#endif /*__ARM_COMPUTE_CLSTRIDEDSLICEEX_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h index 06cd1ee9b..5327e016f 100644 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h @@ -14,51 +14,79 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + +/** + * @file CLTopKV2.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLTopKV2 class + */ #ifndef __ARM_COMPUTE_CLTOPK_V2_H__ #define __ARM_COMPUTE_CLTOPK_V2_H__ #include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h" -#include "arm_compute/runtime/CL/CLArray.h" #include "arm_compute/runtime/IFunction.h" namespace arm_compute { class ICLTensor; -/** Basic function to execute TopK operation. This function calls the following OpenCL kernels: - * - * -# @ref CLTopKV2Kernel +/** + * @brief Class to execute TopKV2 operation. 
 */
 class CLTopKV2 : public IFunction
 {
 public:
-  /** Constructor */
+  /**
+   * @brief Construct a new CLTopKV2 object
+   */
   CLTopKV2();
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
+
+  /**
+   * @brief Prevent instances of this class from being copied (As this class contains pointers)
+   */
   CLTopKV2(const CLTopKV2 &) = delete;
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
+
+  /**
+   * @brief Prevent instances of this class from being copied (As this class contains pointers)
+   */
   CLTopKV2 &operator=(const CLTopKV2 &) = delete;
-  /** Allow instances of this class to be moved */
+
+  /**
+   * @brief Construct a new CLTopKV2 object by using move constructor
+   * @param[in] CLTopKV2 object to move
+   */
   CLTopKV2(CLTopKV2 &&) = default;
-  /** Allow instances of this class to be moved */
+
+  /**
+   * @brief Assign a CLTopKV2 object.
+   * @param[in] CLTopKV2 object to assign. This object will be moved.
+   */
   CLTopKV2 &operator=(CLTopKV2 &&) = default;
-  /** Initialise the kernel's inputs and outputs.
-   *
-   * @note When locations of min and max occurrences are requested, the reported number of locations
-   * is limited to the given array size.
-   *
+
+  /**
+   * @brief Initialise the kernel's inputs and outputs.
    * @param[in] input Input image. Data types supported: U8/S16/F32.
    * @param[in] k The value of `k`.
    * @param[out] values Top k values. Data types supported: S32 if input type is U8/S16, F32 if
    * input type is F32.
-   * @param[out] indices indices related to top k values. Data types supported: S32 if input type
+   * @param[out] indices Indices related to top k values. Data types supported: S32 if input type
    * is U8/S16, F32 if input type is F32.
+   * @return N/A
    */
   void configure(ICLTensor *input, int k, ICLTensor *values, ICLTensor *indices,
                  int total_bits = 32, int bits = 4);
 
-  // Inherited methods overridden:
+  /**
+   * @brief Run the kernels contained in the function
+   * Depending on the value of the following environment variables it works differently:
+   * - If the value of environment variable "ACL_TOPKV2" == "GPU_SINGLE",
+   * quick sort on GPU is used.
+   * - If the value of environment variable "ACL_TOPKV2" == "GPU",
+   * radix sort on GPU is used.
+   * - For any other value, TopKV2 runs on the CPU
+   * @return N/A
+   */
   void run() override;
 
 private:
diff --git a/libs/ARMComputeEx/arm_compute/runtime/NEON/functions/NENormalizationLayerEx.h b/libs/ARMComputeEx/arm_compute/runtime/NEON/functions/NENormalizationLayerEx.h
new file mode 100644
index 000000000..fa7408ecd
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/NEON/functions/NENormalizationLayerEx.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#ifndef __ARM_COMPUTE_NENORMALIZATIONLAYEREX_H__ +#define __ARM_COMPUTE_NENORMALIZATIONLAYEREX_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/NEON/kernels/NENormalizationLayerExKernel.h" +#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h" +#include "arm_compute/runtime/MemoryGroup.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to compute a normalization layer. This function calls the following NEON kernels: + * + * -# @ref NEPixelWiseMultiplicationKernel + * -# @ref NEFillBorderKernel + * -# @ref NENormalizationLayerKernelEx + * + */ +class NENormalizationLayerEx : public IFunction +{ +public: + /** Default constructor */ + NENormalizationLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Set the input and output tensors. + * + * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions + * [width, height, IFM], + * and an optional 4th dimension for batch of inputs. Data type supported: + * F16/F32 + * @param[out] output Destination with the same dimensions, data type and number of channels of + * @p input + * @param[in] norm_info Normalization layer information like the normalization type, + * normalization size and other parameters. + */ + void configure(const ITensor *input, ITensor *output, const NormalizationLayerInfo &norm_info); + /** Static function to check if given info will lead to a valid configuration of @ref + * NENormalizationLayer + * + * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions + * [width, height, IFM], + * and an optional 4th dimension for batch of inputs. 
Data type supported: + * F16/F32 + * @param[in] output Destination with the same dimensions, data type and number of channels of + * @p input + * @param[in] norm_info Normalization layer information like the normalization type, normalization + * size and other parameters. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const NormalizationLayerInfo &norm_info); + + // Inherited methods overridden: + void run() override; + +private: + MemoryGroup _memory_group; /**< Function memory group */ + NENormalizationLayerExKernel _norm_kernel; /**< Normalization layer kernel */ + NEPixelWiseMultiplicationKernel _multiply_kernel; /**< Pixel multiplication kernel */ + NEFillBorderKernel _border_handler; /**< Kernel to handle borders */ + Tensor _input_squared; /**< The intermediate buffer which stores results of squaring input */ +}; +} +#endif /* __ARM_COMPUTE_NENORMALIZATIONLAYEREX_H__ */ diff --git a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp index d535c5da4..05ecdeb22 100644 --- a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp +++ b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp @@ -38,255 +38,37 @@ using namespace arm_compute; const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map = { - {"absdiff", "absdiff.cl"}, - {"accumulate", "accumulate.cl"}, - {"accumulate_squared", "accumulate.cl"}, - {"accumulate_weighted", "accumulate.cl"}, - {"activation_layer", "activation_layer.cl"}, - {"activation_layer_qa8", "activation_layer_qa8.cl"}, - {"activation_layer_logistic_qa8", "activation_layer_qa8.cl"}, - {"arithmetic_add", "arithmetic_op.cl"}, - {"arithmetic_sub", "arithmetic_op.cl"}, + // ARMComputeEx kernels + {"activation_layer_ex", "activation_layer_ex.cl"}, + {"arg_op", "arg_operation.cl"}, + {"arithmetic_sub_ex", "arithmetic_op_ex.cl"}, {"arithmetic_add_qasymm8", "arithmetic_op_quantized.cl"}, - 
{"batchnormalization_layer_nchw", "batchnormalization_layer.cl"}, - {"batchnormalization_layer_nhwc", "batchnormalization_layer.cl"}, - {"bitwise_or", "bitwise_op.cl"}, - {"bitwise_and", "bitwise_op.cl"}, - {"bitwise_xor", "bitwise_op.cl"}, - {"bitwise_not", "bitwise_op.cl"}, + {"batch_to_space_nd", "batch_to_space_nd.cl"}, + {"binary_logical_op", "binary_logical_op.cl"}, {"cast", "cast.cl"}, {"cast_qasymm_in", "cast.cl"}, {"cast_qasymm_out", "cast.cl"}, - {"channel_combine_NV", "channel_combine.cl"}, - {"channel_combine_RGB888", "channel_combine.cl"}, - {"channel_combine_RGBA8888", "channel_combine.cl"}, - {"channel_combine_UYVY422", "channel_combine.cl"}, - {"channel_combine_YUYV422", "channel_combine.cl"}, - {"channel_shuffle_nchw", "channel_shuffle.cl"}, - {"channel_extract_NV12", "channel_extract.cl"}, - {"channel_extract_NV21", "channel_extract.cl"}, - {"channel_extract_RGB888", "channel_extract.cl"}, - {"channel_extract_RGBA8888", "channel_extract.cl"}, - {"channel_extract_UYVY422", "channel_extract.cl"}, - {"channel_extract_YUYV422", "channel_extract.cl"}, - {"combine_gradients_L1", "canny.cl"}, - {"combine_gradients_L2", "canny.cl"}, - {"concatenate_depth", "concatenate.cl"}, - {"concatenate_width", "concatenate.cl"}, - {"convolution_rectangle", "convolution_rectangle.cl"}, - {"col2im", "col2im.cl"}, - {"convert_depth_down", "depth_convert.cl"}, - {"convert_depth_up", "depth_convert.cl"}, - {"convert_fc_weights", "convert_fc_weights.cl"}, - {"convolution3x3_static", "convolution3x3.cl"}, - {"convolution5x5_static", "convolution5x5.cl"}, - {"convolution7x7_static", "convolution7x7.cl"}, - {"convolution9x9_static", "convolution9x9.cl"}, - {"convolution_separable1x5_static", "convolution5x5.cl"}, - {"convolution_separable5x1_static", "convolution5x5.cl"}, - {"convolution_separable1x7_static", "convolution7x7.cl"}, - {"convolution_separable7x1_static", "convolution7x7.cl"}, - {"convolution_separable1x9_static", "convolution9x9.cl"}, - 
{"convolution_separable9x1_static", "convolution9x9.cl"}, - {"copy_tensor", "copy_tensor.cl"}, - {"copy_plane", "channel_extract.cl"}, - {"copy_planes_3p", "channel_combine.cl"}, - {"copy_to_keypoint", "fast_corners.cl"}, - {"deconvolution_upsample", "deconvolution_layer.cl"}, - {"depthwise_convolution_3x3", "depthwise_convolution.cl"}, - {"depthwise_convolution_3x3_f16", "depthwise_convolution.cl"}, - {"depthwise_convolution_3x3_quantized_nchw", "depthwise_convolution_quantized.cl"}, - {"depthwise_convolution_3x3_quantized_nhwc_stride1", "depthwise_convolution_quantized.cl"}, - {"depthwise_convolution_3x3_quantized_nhwc_stride2", "depthwise_convolution_quantized.cl"}, - {"depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16", "depthwise_convolution.cl"}, - {"depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16", "depthwise_convolution.cl"}, - {"depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32", "depthwise_convolution.cl"}, - {"depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32", "depthwise_convolution.cl"}, - {"depthwise_im2col", "depthwise_convolution.cl"}, - {"depthwise_vector_to_tensor", "depthwise_convolution.cl"}, - {"depthwise_weights_reshape", "depthwise_convolution.cl"}, - {"dequantization_layer", "dequantization_layer.cl"}, - {"derivative", "derivative.cl"}, - {"dilate", "dilate.cl"}, - {"direct_convolution1x1", "direct_convolution1x1.cl"}, - {"direct_convolution1x1_f32_bifrost", "direct_convolution1x1.cl"}, - {"direct_convolution3x3", "direct_convolution3x3.cl"}, - {"direct_convolution3x3_f32_bifrost", "direct_convolution3x3.cl"}, - {"direct_convolution5x5", "direct_convolution5x5.cl"}, - {"direct_convolution5x5_f32_bifrost", "direct_convolution5x5.cl"}, - {"direct_convolution_1x1_3x3_5x5_quantized", "direct_convolution_1x1_3x3_5x5_quantized.cl"}, - {"erode", "erode.cl"}, - {"fast_corners", "fast_corners.cl"}, - {"fill_image_borders_constant", "fill_border.cl"}, - {"fill_image_borders_replicate", "fill_border.cl"}, - {"finalize", 
"optical_flow_pyramid_lk.cl"}, - {"floor_layer", "floor.cl"}, + {"comparison_op", "comparison_op.cl"}, + {"comparison_op_qasymm8", "comparison_op_quantized.cl"}, + {"depth_to_space", "depth_to_space.cl"}, + {"embedding_lookup", "embedding_lookup.cl"}, + {"exp_layer", "exp.cl"}, {"gather", "gather.cl"}, {"gather_1d", "gather.cl"}, {"gather_1d_out", "gather.cl"}, - {"gaussian1x5_sub_x", "gaussian_pyramid.cl"}, - {"gaussian5x1_sub_y", "gaussian_pyramid.cl"}, - {"gemm_accumulate_biases", "gemm.cl"}, - {"gemm_interleave4x4", "gemm.cl"}, - {"gemm_ma_f16", "gemm.cl"}, - {"gemm_ma_f32", "gemm.cl"}, - {"gemm_ma_qs8", "gemm.cl"}, - {"gemm_ma_qs16", "gemm.cl"}, - {"gemm_mv", "gemv.cl"}, - {"gemm_mv_quantized", "gemv.cl"}, - {"gemm_mm_interleaved_transposed_f16", "gemm.cl"}, - {"gemm_mm_interleaved_transposed_f16_bifrost", "gemm.cl"}, - {"gemm_mm_interleaved_transposed_f32", "gemm.cl"}, - {"gemm_mm_interleaved_transposed_f32_bifrost", "gemm.cl"}, - {"gemm_mm_interleaved_transposed_qs8", "gemm.cl"}, - {"gemm_mm_interleaved_transposed_qs16", "gemm.cl"}, - {"gemm_mm_floating_point", "gemm.cl"}, - {"gemm_mm_floating_point_f16_bifrost", "gemm.cl"}, - {"gemm_mm_floating_point_f32_bifrost", "gemm.cl"}, - {"gemm_mm_floating_point_f32_bifrost_1000", "gemm.cl"}, - {"gemm_mm_qs8", "gemm.cl"}, - {"gemm_mm_qs16", "gemm.cl"}, - {"gemm_lc_vm_f32", "gemm.cl"}, - {"gemm_transpose1xW", "gemm.cl"}, - {"gemmlowp_matrix_a_reduction", "gemmlowp.cl"}, - {"gemmlowp_matrix_b_reduction", "gemmlowp.cl"}, - {"gemmlowp_mm_bifrost", "gemmlowp.cl"}, - {"gemmlowp_mm_midgard", "gemmlowp.cl"}, - {"gemmlowp_mm_interleaved_transposed_bifrost", "gemmlowp.cl"}, - {"gemmlowp_mm_interleaved_transposed_midgard", "gemmlowp.cl"}, - {"gemmlowp_offset_contribution", "gemmlowp.cl"}, - {"gemmlowp_output_stage_quantize_down", "gemmlowp.cl"}, - {"gemmlowp_output_stage_quantize_down_fixedpoint", "gemmlowp.cl"}, - {"harris_score_3x3", "harris_corners.cl"}, - {"harris_score_5x5", "harris_corners.cl"}, - {"harris_score_7x7", 
"harris_corners.cl"}, - {"hist_border_kernel", "histogram.cl"}, - {"hist_border_kernel_fixed", "histogram.cl"}, - {"hist_local_kernel", "histogram.cl"}, - {"hist_local_kernel_fixed", "histogram.cl"}, - {"hog_block_normalization", "hog.cl"}, - {"hog_detector", "hog.cl"}, - {"hog_orientation_binning", "hog.cl"}, - {"hysteresis", "canny.cl"}, - {"im2col1x1_stridex1_dchw", "im2col.cl"}, - {"im2col3x3_dchw", "im2col.cl"}, - {"im2col5x5_dchw", "im2col.cl"}, - {"im2col11x11_padx0_pady0_dchw", "im2col.cl"}, - {"im2col_generic_dchw", "im2col.cl"}, - {"im2col_generic_padx0_pady0_dchw", "im2col.cl"}, - {"im2col_reduced_dchw", "im2col.cl"}, - {"init_level", "optical_flow_pyramid_lk.cl"}, - {"init_level_max", "optical_flow_pyramid_lk.cl"}, - {"init_level_max_initial_estimate", "optical_flow_pyramid_lk.cl"}, - {"integral_horizontal", "integral_image.cl"}, - {"integral_vertical", "integral_image.cl"}, - {"IYUV_to_NV12_bt709", "color_convert.cl"}, - {"IYUV_to_RGB888_bt709", "color_convert.cl"}, - {"IYUV_to_RGBA8888_bt709", "color_convert.cl"}, - {"IYUV_to_YUV444_bt709", "color_convert.cl"}, - {"l2_normalize", "l2_normalize.cl"}, - {"lktracker_stage0", "optical_flow_pyramid_lk.cl"}, - {"lktracker_stage1", "optical_flow_pyramid_lk.cl"}, - {"magnitude_phase", "magnitude_phase.cl"}, - {"mean_stddev_accumulate", "mean_stddev.cl"}, - {"minmax", "minmaxloc.cl"}, - {"minmax_border", "minmaxloc.cl"}, - {"minmax_layer", "minmax_layer.cl"}, - {"minmaxloc", "minmaxloc.cl"}, - {"non_linear_filter_box3x3", "non_linear_filter3x3.cl"}, - {"non_linear_filter_cross3x3", "non_linear_filter3x3.cl"}, - {"non_linear_filter_disk3x3", "non_linear_filter3x3.cl"}, - {"non_linear_filter_box5x5", "non_linear_filter5x5.cl"}, - {"non_linear_filter_cross5x5", "non_linear_filter5x5.cl"}, - {"non_linear_filter_disk5x5", "non_linear_filter5x5.cl"}, - {"non_max_suppression", "nonmax.cl"}, - {"normalization_layer_cross_map", "normalization_layer.cl"}, - {"normalization_layer_in_map", "normalization_layer.cl"}, - 
{"NV12_to_IYUV_bt709", "color_convert.cl"}, - {"NV12_to_RGB888_bt709", "color_convert.cl"}, - {"NV12_to_RGBA8888_bt709", "color_convert.cl"}, - {"NV12_to_YUV444_bt709", "color_convert.cl"}, - {"NV21_to_IYUV_bt709", "color_convert.cl"}, - {"NV21_to_RGB888_bt709", "color_convert.cl"}, - {"NV21_to_RGBA8888_bt709", "color_convert.cl"}, - {"NV21_to_YUV444_bt709", "color_convert.cl"}, - {"output_stage_quantized", "direct_convolution_1x1_3x3_5x5_quantized.cl"}, - {"permute_201", "permute.cl"}, - {"permute_120", "permute.cl"}, - {"permute_3201", "permute.cl"}, - {"pixelwise_mul_float", "pixelwise_mul_float.cl"}, - {"pixelwise_mul_int", "pixelwise_mul_int.cl"}, + {"hashtable_lookup", "hashtable_lookup.cl"}, + {"neg_tensor", "neg_tensor.cl"}, + {"pad", "pad.cl"}, + {"permute_generic", "permute_ex.cl"}, {"pixelwise_mul_qasymm8", "pixelwise_mul_quantized.cl"}, {"pixelwise_div_float", "pixelwise_div_float.cl"}, {"pixelwise_div_int", "pixelwise_div_int.cl"}, - {"pooling_layer_2", "pooling_layer.cl"}, - {"pooling_layer_3", "pooling_layer.cl"}, - {"pooling_layer_optimized_3", "pooling_layer.cl"}, - {"pooling_layer_7", "pooling_layer.cl"}, - {"pooling_layer_MxN_nchw", "pooling_layer.cl"}, - {"pooling_layer_MxN_nhwc", "pooling_layer.cl"}, - {"pooling_layer_MxN_quantized_nhwc", "pooling_layer_quantized.cl"}, - {"pooling_layer_MxN_quantized_nchw", "pooling_layer_quantized.cl"}, - {"quantization_layer", "quantization_layer.cl"}, - {"reduce_max", "reduce_max.cl"}, - {"reduction_operation", "reduction_operation.cl"}, - {"reduction_mean", "reduction_mean.cl"}, - {"remap_nearest_neighbour", "remap.cl"}, - {"remap_bilinear", "remap.cl"}, - {"reshape_layer", "reshape_layer.cl"}, - {"reshape_to_columns", "convolution_layer.cl"}, - {"RGB888_to_IYUV_bt709", "color_convert.cl"}, - {"RGB888_to_NV12_bt709", "color_convert.cl"}, - {"RGB888_to_RGBA8888_bt709", "color_convert.cl"}, - {"RGB888_to_YUV444_bt709", "color_convert.cl"}, - {"RGBA8888_to_IYUV_bt709", "color_convert.cl"}, - 
{"RGBA8888_to_NV12_bt709", "color_convert.cl"}, - {"RGBA8888_to_RGB888_bt709", "color_convert.cl"}, - {"RGBA8888_to_YUV444_bt709", "color_convert.cl"}, - {"roi_pooling_layer", "roi_pooling_layer.cl"}, - {"scale_nearest_neighbour", "scale.cl"}, - {"scale_bilinear", "scale.cl"}, - {"scharr3x3", "scharr_filter.cl"}, - {"sobel3x3", "sobel_filter.cl"}, - {"sobel_separable5x1", "sobel_filter.cl"}, - {"sobel_separable1x5", "sobel_filter.cl"}, - {"sobel_separable7x1", "sobel_filter.cl"}, - {"sobel_separable1x7", "sobel_filter.cl"}, - {"softmax_layer_norm", "softmax_layer.cl"}, - {"softmax_layer_norm_quantized", "softmax_layer_quantized.cl"}, - {"softmax_layer_max_shift_exp_sum_quantized_serial", "softmax_layer_quantized.cl"}, - {"softmax_layer_max_shift_exp_sum_quantized_parallel", "softmax_layer_quantized.cl"}, - {"softmax_layer_max_shift_exp_sum_serial", "softmax_layer.cl"}, - {"softmax_layer_max_shift_exp_sum_parallel", "softmax_layer.cl"}, - {"strided_slice", "strided_slice.cl"}, - {"suppress_non_maximum", "canny.cl"}, - {"tablelookup_U8", "tablelookup.cl"}, - {"tablelookup_S16", "tablelookup.cl"}, - {"threshold_binary", "threshold.cl"}, - {"threshold_range", "threshold.cl"}, - {"transpose", "transpose.cl"}, - {"UYVY422_to_IYUV_bt709", "color_convert.cl"}, - {"UYVY422_to_NV12_bt709", "color_convert.cl"}, - {"UYVY422_to_RGB888_bt709", "color_convert.cl"}, - {"UYVY422_to_RGBA8888_bt709", "color_convert.cl"}, - {"warp_affine_nearest_neighbour", "warp_affine.cl"}, - {"warp_affine_bilinear", "warp_affine.cl"}, - {"warp_perspective_nearest_neighbour", "warp_perspective.cl"}, - {"warp_perspective_bilinear", "warp_perspective.cl"}, - {"winograd_filter_transform_2x2_3x3_nchw", "winograd.cl"}, - {"winograd_filter_transform_4x4_3x3_nchw", "winograd.cl"}, - {"winograd_filter_transform_4x4_5x5_nchw", "winograd.cl"}, - {"winograd_input_transform_4x4_5x5_stepz1_nchw", "winograd.cl"}, - {"winograd_input_transform_2x2_3x3_stepz1_nchw", "winograd.cl"}, - 
{"winograd_input_transform_2x2_3x3_stepz2_nchw", "winograd.cl"}, - {"winograd_input_transform_4x4_3x3_stepz1_nchw", "winograd.cl"}, - {"winograd_output_transform_2x2_3x3_nchw", "winograd.cl"}, - {"winograd_output_transform_4x4_3x3_nchw", "winograd.cl"}, - {"winograd_output_transform_4x4_5x5_nchw", "winograd.cl"}, - {"YUYV422_to_IYUV_bt709", "color_convert.cl"}, - {"YUYV422_to_NV12_bt709", "color_convert.cl"}, - {"YUYV422_to_RGB888_bt709", "color_convert.cl"}, - {"YUYV422_to_RGBA8888_bt709", "color_convert.cl"}, + {"prelu", "prelu.cl"}, + {"prelu_qasymm8", "prelu_quantized.cl"}, + {"reduce_min_max", "reduce_operation.cl"}, + {"reduce_sum_mean", "reduce_operation.cl"}, + {"squared_difference", "squared_difference.cl"}, + {"strided_slice_ex", "strided_slice_ex.cl"}, {"topkv2_init", "topkv2.cl"}, {"topkv2_find_first_negative", "topkv2.cl"}, {"topkv2_reorder_negatives", "topkv2.cl"}, @@ -296,23 +78,62 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map {"radixsort_pastehistograms", "topkv2_radixsort.cl"}, {"radixsort_reorder", "topkv2_radixsort.cl"}, {"topkv2_quicksort", "topkv2_quicksort.cl"}, + {"space_to_batch_4d_nchw", "space_to_batch.cl"}, + {"space_to_batch_4d_nhwc", "space_to_batch.cl"}, + {"space_to_depth", "space_to_depth.cl"}, }; const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map = { #ifdef EMBEDDED_KERNELS { + "activation_layer_ex.cl", +#include "./cl_kernels/activation_layer_ex.clembed" + }, + { + "arg_operation.cl", +#include "./cl_kernels/arg_operation.clembed" + }, + { + "arithmetic_op_ex.cl", +#include "./cl_kernels/arithmetic_op_ex.clembed" + }, + { + "batch_to_space_nd.cl", +#include "./cl_kernels/batch_to_space_nd.clembed" + }, + { "cast.cl", #include "./cl_kernels/cast.clembed" }, { - "fixed_point.h", -#include "./cl_kernels/fixed_point.hembed" + "comparison_op.cl", +#include "./cl_kernels/comparison_op.clembed" + }, + { + "comparison_op_quantized.cl", +#include 
"./cl_kernels/comparison_op_quantized.clembed" + }, + { + "embedding_lookup.cl", +#include "./cl_kernels/embedding_lookup.clembed" + }, + { + "depth_to_space.cl", +#include "./cl_kernels/depth_to_space.clembed" + }, + { + "exp.cl", +#include "./cl_kernels/exp.clembed" }, { "gather.cl", #include "./cl_kernels/gather.clembed" }, { + "hashtable_lookup.cl", +#include "./cl_kernels/hashtable_lookup.clembed" + }, + { "helpers.h", #include "./cl_kernels/helpers.hembed" }, @@ -321,6 +142,18 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map #include "./cl_kernels/helpers_asymm.hembed" }, { + "binary_logical_op.cl", +#include "./cl_kernels/binary_logical_op.clembed" + }, + { + "neg_tensor.cl", +#include "./cl_kernels/neg_tensor.clembed" + }, + { + "pad.cl", +#include "./cl_kernels/pad.clembed" + }, + { "pixelwise_div_float.cl", #include "./cl_kernels/pixelwise_div_float.clembed" }, @@ -329,16 +162,32 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map #include "./cl_kernels/pixelwise_div_int.clembed" }, { - "reduce_max.cl", -#include "./cl_kernels/reduce_max.clembed" + "prelu.cl", +#include "./cl_kernels/prelu.clembed" + }, + { + "prelu_quantized.cl", +#include "./cl_kernels/prelu_quantized.clembed" + }, + { + "reduce_operation.cl", +#include "./cl_kernels/reduce_operation.clembed" + }, + { + "space_to_batch.cl", +#include "./cl_kernels/space_to_batch.clembed" }, { - "reduction_mean.cl", -#include "./cl_kernels/reduction_mean.clembed" + "space_to_depth.cl", +#include "./cl_kernels/space_to_depth.clembed" }, { - "strided_slice.cl", -#include "./cl_kernels/strided_slice.clembed" + "squared_difference.cl", +#include "./cl_kernels/squared_difference.clembed" + }, + { + "strided_slice_ex.cl", +#include "./cl_kernels/strided_slice_ex.clembed" }, { "topkv2.cl", @@ -352,6 +201,11 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map "topkv2_quicksort.cl", #include 
"./cl_kernels/topkv2_quicksort.clembed" }, + { + "permute_ex.cl", +#include "./cl_kernels/permute_ex.clembed" + }, + #endif /* EMBEDDED_KERNELS */ }; @@ -359,7 +213,7 @@ CLKernelLibraryEx::CLKernelLibraryEx() : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map() { opencl_is_available(); // Make sure the OpenCL symbols are initialised *before* the - // CLKernelLibrary is built + // CLKernelLibraryEx is built } CLKernelLibraryEx &CLKernelLibraryEx::get() @@ -380,7 +234,7 @@ Kernel CLKernelLibraryEx::create_kernel(const std::string &kernel_name, } std::string concat_str; - if (fp16_supported(_device)) + if (fp16_supported()) { concat_str += " -DARM_COMPUTE_OPENCL_FP16_ENABLED=1 "; } @@ -434,6 +288,13 @@ void CLKernelLibraryEx::add_built_program(const std::string &built_program_name, _built_programs_map.emplace(built_program_name, program); } +bool CLKernelLibraryEx::fp16_supported() const { return ::fp16_supported(_device); } + +bool CLKernelLibraryEx::int64_base_atomics_supported() const +{ + return device_supports_extension(_device, "cl_khr_int64_base_atomics"); +} + const Program &CLKernelLibraryEx::load_program(const std::string &program_name) const { const auto program_it = _programs_map.find(program_name); @@ -525,6 +386,7 @@ size_t CLKernelLibraryEx::max_local_workgroup_size(const cl::Kernel &kernel) con cl::NDRange CLKernelLibraryEx::default_ndrange() const { + // GPUTarget _target = get_target_from_device(_device); cl::Device device = cl::Device::getDefault(); GPUTarget _target = get_target_from_device(device); cl::NDRange default_range; diff --git a/libs/ARMComputeEx/src/core/CL/OpenCLEx.cpp b/libs/ARMComputeEx/src/core/CL/OpenCLEx.cpp new file mode 100644 index 000000000..cbda169fb --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/OpenCLEx.cpp @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/CL/OpenCLEx.h" + +#include <dlfcn.h> +#include <iostream> + +namespace arm_compute +{ +CLSymbolsEx &CLSymbolsEx::get() +{ + static CLSymbolsEx symbols; + return symbols; +} + +bool CLSymbolsEx::load_default() +{ + static const std::vector<std::string> libraries{"libOpenCL.so", "libGLES_mali.so", "libmali.so"}; + + if (_loaded.first) + { + return _loaded.second; + } + + // Indicate that default loading has been tried + _loaded.first = true; + + for (const auto &lib : libraries) + { + if (load(lib)) + { + return true; + } + } + + std::cerr << "Couldn't find any OpenCL library.\n"; + return false; +} + +bool CLSymbolsEx::load(const std::string &library) +{ + void *handle = dlopen(library.c_str(), RTLD_LAZY | RTLD_LOCAL); + + if (handle == nullptr) + { + std::cerr << "Can't load " << library << ": " << dlerror() << "\n"; + // Set status of loading to failed + _loaded.second = false; + return false; + } + +#define LOAD_FUNCTION_PTR(func_name, handle) \ + func_name##_ptr = reinterpret_cast<decltype(func_name) *>(dlsym(handle, #func_name)); + + LOAD_FUNCTION_PTR(clGetEventInfo, handle); + LOAD_FUNCTION_PTR(clSetEventCallback, handle); + +#undef LOAD_FUNCTION_PTR + + // Don't call dlclose(handle) or all the symbols will be unloaded ! 
+ + // Disable default loading and set status to successful + _loaded = std::make_pair(true, true); + + return true; +} + +} // namespace arm_compute + +cl_int clGetEventInfo(cl_event event, cl_event_info param_name, size_t param_value_size, + void *param_value, size_t *param_value_size_ret) +{ + arm_compute::CLSymbolsEx::get().load_default(); + auto func = arm_compute::CLSymbolsEx::get().clGetEventInfo_ptr; + if (func != nullptr) + { + return func(event, param_name, param_value_size, param_value, param_value_size_ret); + } + else + { + return CL_OUT_OF_RESOURCES; + } +} + +cl_int clSetEventCallback(cl_event event, cl_int command_exec_callback_type, + void(CL_CALLBACK *pfn_ev_notify)(cl_event ev, cl_int ev_cmd_exec_status, + void *user_data), + void *user_data) +{ + arm_compute::CLSymbolsEx::get().load_default(); + auto func = arm_compute::CLSymbolsEx::get().clSetEventCallback_ptr; + if (func != nullptr) + { + return func(event, command_exec_callback_type, pfn_ev_notify, user_data); + } + else + { + return CL_OUT_OF_RESOURCES; + } +} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl new file mode 100644 index 000000000..f54c7bde3 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + +#define CONST_ONE 1.f +#define DIV_OP(a, b) ((a) / (b)) +#define RSQRT_OP(a) DIV_OP(CONST_ONE, sqrt((a))) + +// Inverse Square-root Activation +inline TYPE rsqrt_op(TYPE x) +{ + return RSQRT_OP(x); +} + +#define ACTIVATION_OP2(op, x) op##_op(x) +#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x) + +#if defined(ACT) + +/** This performs an activation function floating point inputs. + * + * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH + * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively. + * + * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] output_ptr Pointer to the destination image. 
Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void activation_layer_ex( + TENSOR3D_DECLARATION(input) +#ifndef IN_PLACE + , + TENSOR3D_DECLARATION(output) +#endif /* not IN_PLACE */ +) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); +#ifdef IN_PLACE + Tensor3D output = input; +#else /* IN_PLACE */ + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); +#endif /* IN_PLACE */ + + // Load data + TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr); + + // Perform activation + data = ACTIVATION_OP(ACT, data); + + // Store result + VSTORE(VEC_SIZE) + (data, 0, (__global DATA_TYPE *)output.ptr); +} + +#endif /* defined(ACT) */ diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl new file mode 100644 index 000000000..9a6921d7c --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) +/** Perform arg_max/arg_min + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 + * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using + * -DOP_CODE = number. e.g. -DOP_CODE=1 + * + * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[out] output_ptr Pointer to the destination image. 
Supported data types: U32 + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] axis Axis through which reduction occurs for max value index + * @param[in] dim Dimension across the axis to be reduced. 
+ */ + +__kernel void arg_op(TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output), + const int axis, + const int dim) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + + int indices[4] = + { + get_global_id(0), + get_global_id(1), + get_global_id(2) % DEPTH_OUT, + get_global_id(2) / DEPTH_OUT, + }; + + DATA_TYPE value = *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); + DATA_TYPE tval = value; + int idx = 0; + for(int i = 1; i < dim; ++i) + { + indices[axis] = i; + + #if OP_CODE == 1 // ArgMax + value = max(value, *((__global DATA_TYPE *) + tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]))); + #elif OP_CODE == 2 //ArgMin + value = min(value, *((__global DATA_TYPE *) + tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]))); + #else + return; + + #endif + + if(tval!=value) + { + idx = indices[axis]; + tval = value; + } + } + + *((__global uint *)out.ptr) = idx; +} +#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_ex.cl new file mode 100644 index 000000000..2ed698951 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_ex.cl @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#ifdef SATURATE +#define SUB(x, y) sub_sat((x), (y)) +#else /* SATURATE */ +#define SUB(x, y) (x) - (y) +#endif /* SATURATE */ + +/** This function subtracts one tensors from another. + * + * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: + * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short + * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used. + * + * @param[in] in1_ptr Pointer to the source tensor. Supported data types: U8, S16 + * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] in2_ptr Pointer to the source tensor. 
Supported data types: U8, S16 + * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8, S16 + * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void arithmetic_sub_ex( + TENSOR3D_DECLARATION(in1), + TENSOR3D_DECLARATION(in2), + TENSOR3D_DECLARATION(out)) +{ + // Get pixels pointer + Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); + Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); + + // Load values + VEC_DATA_TYPE(DATA_TYPE_OUT, 16) + in_a = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16)); + VEC_DATA_TYPE(DATA_TYPE_OUT, 16) + in_b = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), 
VEC_DATA_TYPE(DATA_TYPE_OUT, 16)); + + // Calculate and store result + vstore16(SUB(in_a, in_b), 0, (__global DATA_TYPE_OUT *)out.ptr); +} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl index 0c0a9ede6..5cd0a4309 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl @@ -2,32 +2,20 @@ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved * Copyright (c) 2016, 2017 ARM Limited. * - * SPDX-License-Identifier: MIT + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: + * http://www.apache.org/licenses/LICENSE-2.0 * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ #include "helpers_asymm.h" -#if defined(FIXED_POINT_POSITION) -#include "fixed_point.h" -#endif /* FIXED_POINT_POSITION */ - #ifdef SATURATE #define ADD(x, y) add_sat((x), (y)) #define SUB(x, y) sub_sat((x), (y)) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/batch_to_space_nd.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/batch_to_space_nd.cl new file mode 100644 index 000000000..ad6a48a02 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/batch_to_space_nd.cl @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016, 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE0) && defined(BLOCK_SIZE1) && defined(BATCH_OUT) +/** Perform batch to space rearrangement of tensor + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Output tensor batch should be given as a preprocessor argument using -DBATCH_OUT=size. e.g. 
-DBATCH_OUT=16 + * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE0=size. e.g. -DBLOCK_SIZE0=1 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p inpu +t_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in +bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void batch_to_space_nd( + 
TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output)) + { + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + + int out_index[4]={0}; + int in_index[4]={0}; + + out_index[0] = get_global_id(0);//W + out_index[1] = get_global_id(1);//H + out_index[2] = get_global_id(2) % DEPTH_OUT;//C + out_index[3] = get_global_id(2) / DEPTH_OUT;//N + + in_index[0] = out_index[0]/BLOCK_SIZE1; + in_index[1] = out_index[1]/BLOCK_SIZE0; + in_index[2] = out_index[2]; + in_index[3] = out_index[3] + ((out_index[1] % BLOCK_SIZE0) * BLOCK_SIZE0 + out_index[0] % BLOCK_SIZE1) * BATCH_OUT; + + *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3])); + } +#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE0) && defined(BLOCK_SIZE1) && defined(BATCH_OUT) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl new file mode 100644 index 000000000..bea61f53e --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(OP_CODE) && defined(DATA_TYPE) +/** returns truth value of the two input tensors for BINARY LOGICAL OP. + * where BINARY LOGICAL OP can be AND, OR. + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=uchar + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using + * -DOP_CODE = number. e.g. -DOP_CODE=1 + * + * @param[in] input1_ptr Pointer to the source tensor. Supported data types: QASYMM8 + * @param[in] input1_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input1_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source tensor + * + * @param[in] input2_ptr Pointer to the source tensor.Supported data types: QASYMM8 + * @param[in] input2_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input2_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input2_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input2_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input2_step_z input2_stride_z * number 
of elements along Z processed per workitem(in bytes) + * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source tensor + * + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: QASYMM8 + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + */ +__kernel void binary_logical_op( + TENSOR3D_DECLARATION(input1), + TENSOR3D_DECLARATION(input2), + TENSOR3D_DECLARATION(output)) +{ + Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1); + Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + #if OP_CODE == 1 // LOGICAL AND + VSTORE(VEC_SIZE) + (CONVERT(VLOAD(VEC_SIZE) + (0, (__global DATA_TYPE *)input1.ptr) && VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr), + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr); + + #elif OP_CODE == 2 // LOGICAL OR + VSTORE(VEC_SIZE) + (CONVERT(VLOAD(VEC_SIZE) + (0, (__global DATA_TYPE *)input1.ptr) || VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr), + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr); + + #else // OP NOT SUPPORTED + return + + #endif +} +#endif //if defined(OP_CODE) && defined(DATA_TYPE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl index 113804cca..3d4675e5d 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl +++ 
b/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl @@ -2,38 +2,34 @@ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved * Copyright (c) 2017 ARM Limited. * - * SPDX-License-Identifier: MIT + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: + * http://www.apache.org/licenses/LICENSE-2.0 * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ #include "helpers.h" -#ifndef SCALE_IN -#define SCALE_IN 1.0f +#ifndef SCALE +#define SCALE 1.0f +#endif +#ifndef OFFSET +#define OFFSET 0 #endif -#ifndef OFFSET_IN -#define OFFSET_IN 0 +#ifndef VEC_SIZE +#define VEC_SIZE 1 #endif +#if defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) /** Perform a cast operation on an input tensor. * - * @attention Data type can be passed using the -DDATA_TYPE_IN compile flag, e.g. -DDATA_TYPE_IN=float + * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 * * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 @@ -65,9 +61,9 @@ __kernel void cast( 0, (__global DATA_TYPE_OUT *)output.ptr); } - /** Perform a cast operation on an QASYMM8 input tensor. - * + * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int + * @attention Offset and Scale of input should be given as a preprocessor argument using -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5 * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 * * @param[in] input_ptr Pointer to the source image. 
Supported data types: F16/F32 @@ -96,8 +92,8 @@ __kernel void cast_qasymm_in( VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr); - VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET_IN); - VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE_IN); + VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET); + VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE); VEC_DATA_TYPE(int, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(int, VEC_SIZE)) - offset; VEC_DATA_TYPE(float, VEC_SIZE) out_data = CONVERT(tmp, VEC_DATA_TYPE(float, VEC_SIZE)) * scale; @@ -108,7 +104,8 @@ __kernel void cast_qasymm_in( /** Perform a cast operation on an QASYMM8 output tensor. - * + * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int + * @attention Offset and Scale of output should be given as a preprocessor argument using -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5 * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 * * @param[in] input_ptr Pointer to the source image. 
Supported data types: F16/F32 @@ -137,8 +134,8 @@ __kernel void cast_qasymm_out( VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr); - VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET_IN); - VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE_IN); + VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET); + VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE); VEC_DATA_TYPE(float, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(float, VEC_SIZE)) / scale; VEC_DATA_TYPE(float, VEC_SIZE) out_data = tmp + CONVERT(offset, VEC_DATA_TYPE(float, VEC_SIZE)); @@ -146,3 +143,4 @@ __kernel void cast_qasymm_out( VSTORE(VEC_SIZE)(CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)output.ptr); } +#endif // defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op.cl new file mode 100644 index 000000000..765072556 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op.cl @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) && defined(OP_CODE) +/** Returns truth value of comparison operators. + * Comparison operators may be equal, not_equal etc. + * + * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN, -DDATA_TYPE_OUT, + * e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT = uchar + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using + * -DOP_CODE = number. e.g. -DOP_CODE=1 + * + * @param[in] input1_ptr Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] input1_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input1_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source tensor + * + * @param[in] input2_ptr Pointer to the source tensor. 
Supported data types: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] input2_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input2_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input2_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input2_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input2_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source tensor + * + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: QASYMM8 + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void comparison_op( + TENSOR3D_DECLARATION(input1), + TENSOR3D_DECLARATION(input2), + TENSOR3D_DECLARATION(output)) +{ + Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1); + Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + #if OP_CODE == 1 //EQUAL + VSTORE(VEC_SIZE) + (CONVERT(VLOAD(VEC_SIZE) + (0, (__global DATA_TYPE_IN *)input1.ptr) == VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN 
*)input2.ptr), + VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),0, (__global DATA_TYPE_OUT *)output.ptr); + + #elif OP_CODE == 2 //NOT_EQUAL + VSTORE(VEC_SIZE) + (CONVERT(VLOAD(VEC_SIZE) + (0, (__global DATA_TYPE_IN *)input1.ptr) != VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input2.ptr), + VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)output.ptr); + + #else // OP NOT SUPPORTED + return; + + #endif +} +#endif // defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) && defined(OP_CODE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl new file mode 100644 index 000000000..1eb305f7b --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" +#define SUB(x, y) (x) - (y) + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(DATA_TYPE_OUT) + +#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE) +#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) +#define VEC_OUT VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE) + +/** Returns the truth value of comparison . 
+ * @attention Offset and Scale of both input should be given as a preprocessor argument using -DOFFSET_IN1=int, -DOFFSET_IN2=int, -DSCALE_IN1=float and -DSCALE_IN2=float. e.g. -DOFFSET_IN1=1, -DOFFSET_IN2=0, -DSCALE_IN1=0.5, -DSCALE_IN2=0.5 + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using + * -DOP_CODE = number. e.g. -DOP_CODE=1 + * + * @param[in] input1_ptr Pointer to the source tensor. Supported data types: QASYMM8 + * @param[in] input1_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input1_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source tensor + * + * @param[in] input2_ptr Pointer to the source tensor. 
Supported data types: QASYMM8 + * @param[in] input2_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input2_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input2_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input2_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input2_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source tensor + * + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: QASYMM8 + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void comparison_op_qasymm8( + TENSOR3D_DECLARATION(in1), + TENSOR3D_DECLARATION(in2), + TENSOR3D_DECLARATION(out)) +{ + // Get pixels pointer + Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); + Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); + + VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)in1.ptr), VEC_INT); + VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)in2.ptr), VEC_INT); + + in_a = 
SUB(in_a, (VEC_INT)((int)OFFSET_IN1)); + in_b = SUB(in_b, (VEC_INT)((int)OFFSET_IN2)); + + const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1); + const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2); + + #if OPCODE == 1 //EQUAL QUANTIZED + VSTORE(VEC_SIZE)(CONVERT(in1f32 == in2f32, VEC_OUT), 0, (__global DATA_TYPE_OUT *)out.ptr); + + #elif OPCODE == 2 //NOT EQUAL QUANTIZED + VSTORE(VEC_SIZE)(CONVERT(in1f32 != in2f32, VEC_OUT), 0, (__global DATA_TYPE_OUT *)out.ptr); + + #else // OP NOT SUPPORTED + return; + + #endif +} +#endif // defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(DATA_TYPE_OUT) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl new file mode 100644 index 000000000..fef2243e7 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016, 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) +/** Perform space to depth rearrangement of tensor + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. 
-DDATA_TYPE=float + * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16 + * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. -DBLOCK_SIZE=1 + * + * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] output_ptr Pointer to the destination image. 
Supported data types: same as @p inpu +t_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in +bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void depth_to_space( + TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output)) + { + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + + int out_index[4]={0}; + int in_index[4]={0}; + + out_index[0] = get_global_id(0);//W + out_index[1] = get_global_id(1);//H + out_index[2] = get_global_id(2) % DEPTH_OUT;//C + out_index[3] = get_global_id(2) / DEPTH_OUT;//B + + in_index[0] = out_index[0]/BLOCK_SIZE; + in_index[1] = out_index[1]/BLOCK_SIZE; + in_index[2] = out_index[2] + ((out_index[1] % BLOCK_SIZE) * BLOCK_SIZE + out_index[0] % BLOCK_SIZE) * DEPTH_OUT; + in_index[3] = out_index[3]; + + *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2],in_index[3])); + } +#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl new file mode 100644 index 
/*
 * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright (c) 2017 ARM Limited.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "helpers.h"

/* Default to scalar processing when the host does not request vectorization. */
#ifndef VEC_SIZE
#define VEC_SIZE 1
#endif

#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
/** Perform embedding_lookup of input tensor
 *
 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
 * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
 * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16
 * @attention Number of input dimensions are passed as a preprocessor argument using -DNUM_DIMS=size, e.g. -DNUM_DIMS=4
 *
 * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
 * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
 * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
 * @param[in]  input_step_w                         output_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
 * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in bytes)
 * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in]  lookups_ptr                          Pointer to the lookups vector. Supported data types: S32
 * @param[in]  lookups_stride_x                     Stride of the lookups vector in X dimension (in bytes)
 * @param[in]  lookups_step_x                       lookups_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector
 */
__kernel void embedding_lookup(TENSOR4D_DECLARATION(input),
                               TENSOR4D_DECLARATION(output),
                               VECTOR_DECLARATION(lookups))
{
    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT);

    Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups);

    // Lookup ids based on the tensor dimensions: exactly one coordinate — the outermost
    // used dimension, selected by NUM_DIMS — is remapped through the lookups vector,
    // while every other coordinate passes through unchanged.
    int lup_id[4] = {0};

    lup_id[0] = (NUM_DIMS == 1)?*((__global int *)vector_offset(&lups,get_global_id(0)))
                               :get_global_id(0);
    lup_id[1] = (NUM_DIMS == 2)?*((__global int *)vector_offset(&lups,get_global_id(1)))
                               :get_global_id(1);
    lup_id[2] = (NUM_DIMS == 3)?*((__global int *)vector_offset(&lups,get_global_id(2)))
                               :get_global_id(2)%DEPTH_OUT;
    lup_id[3] = (NUM_DIMS == 4)?*((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT))
                               :get_global_id(2) / DEPTH_OUT;

    // NOTE(review): CONVERT_TO_TENSOR4D_STRUCT_NO_STEP in ACL's helpers.h normally already
    // applies input_offset_first_element_in_bytes to in.ptr, so adding it again here looks
    // like a double-count (harmless only when the offset is 0) — confirm against helpers.h.
    in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + lup_id[1] * input_step_y
              + lup_id[2] * input_step_z + lup_id[3] * input_step_w;

    // Copy VEC_SIZE contiguous elements of the selected source row to the output location.
    VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
                     0, (__global DATA_TYPE *)out.ptr);
}
#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "helpers.h"

/* Default to scalar processing when the host does not request vectorization. */
#ifndef VEC_SIZE
#define VEC_SIZE 1
#endif

#if defined(DATA_TYPE)
/** Perform an exponential operation on an input tensor.
 *
 * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
 * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
 * @note Can only take floating point data types.
 *
 * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
 * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
 * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void exp_layer(
    TENSOR3D_DECLARATION(input),
    TENSOR3D_DECLARATION(output))
{
    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

    // Element-wise exp over VEC_SIZE contiguous elements per work item.
    VSTORE(VEC_SIZE)
    (exp(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr)), 0, (__global DATA_TYPE *)output.ptr);
}
#endif // defined(DATA_TYPE)
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_FIXED_POINT_H -#define ARM_COMPUTE_FIXED_POINT_H - -#define TYPE_ALIAS(type, alias) \ - typedef type alias; \ - typedef type alias##x##1; \ - typedef type##2 alias##x##2; \ - typedef type##3 alias##x##3; \ - typedef type##4 alias##x##4; \ - typedef type##8 alias##x##8; \ - typedef type##16 alias##x##16; - -TYPE_ALIAS(char, qs8) -TYPE_ALIAS(short, qs16) -TYPE_ALIAS(int, qs32) - -#define qs8_MIN ((char)CHAR_MIN) -#define qs8_MAX ((char)CHAR_MAX) -#define qs16_MIN ((short)SHRT_MIN) -#define qs16_MAX ((short)SHRT_MAX) -#define qs32_MIN ((int)INT_MIN) -#define qs32_MAX ((int)INT_MAX) - -#define qu8_MIN ((uchar)0) -#define qu8_MAX ((uchar)UCHAR_MAX) -#define qu16_MIN ((ushort)0) -#define qu16_MAX ((ushort)USHRT_MAX) -#define qu32_MIN ((uint)0) -#define qu32_MAX ((uint)UINT_MAX) - -#define qs8_TYPE char -#define qs8x1_TYPE char -#define qs8x2_TYPE char2 -#define qs8x3_TYPE char3 -#define qs8x4_TYPE char4 -#define qs8x8_TYPE char8 -#define qs8x16_TYPE char16 - -#define qs16_TYPE short -#define qs16x1_TYPE short -#define qs16x2_TYPE short2 -#define qs16x3_TYPE short3 -#define qs16x4_TYPE short4 -#define qs16x8_TYPE short8 -#define qs16x16_TYPE short16 - -#define qs32_TYPE int -#define qs32x1_TYPE int -#define qs32x2_TYPE int2 -#define qs32x3_TYPE int3 -#define qs32x4_TYPE int4 -#define qs32x8_TYPE int8 -#define qs32x16_TYPE int16 - -/* All internal constants are represented in the maximum supported fixed point format (QS16), - * thus we define an additional shift parameter required to convert the constant - * from the maximum supported format to the require one. 
- */ -#define qs8_SHIFT 8 -#define qs16_SHIFT 0 - -#undef VEC_DATA_TYPE_STR -#undef VEC_DATA_TYPE -#undef CONVERT_STR -#undef CONVERT -#undef CONVERT_SAT_STR -#undef CONVERT_SAT - -#define VEC_DATA_TYPE_STR(type, size) type##x##size -#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) - -#define CONVERT_STR3(x, type, rtype) (convert_##rtype((x))) -#define CONVERT_STR2(x, type, rtype) CONVERT_STR3(x, type, rtype) -#define CONVERT_STR(x, type) CONVERT_STR2(x, type, type##_TYPE) -#define CONVERT(x, type) CONVERT_STR(x, type) - -#define CONVERT_SAT_STR3(x, type, rtype) (convert_##rtype##_sat((x))) -#define CONVERT_SAT_STR2(x, type, rtype) CONVERT_SAT_STR3(x, type, rtype) -#define CONVERT_SAT_STR(x, type) CONVERT_SAT_STR2(x, type, type##_TYPE) -#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) - -/** Computes saturating absolute value of fixed point vector. - * - * @param[in] type the actual data type. - * - * @return The result of the fixed point absolute value. - */ -#define ABSQ_SAT_IMPL(type) \ - inline type abs_##type##_sat(type VopA) { return CONVERT_SAT(abs(VopA), type); } - -ABSQ_SAT_IMPL(qs8x16) -ABSQ_SAT_IMPL(qs16x8) - -#define ABS_SAT_OP_EXPAND_STR(a, type, size) abs_##type##x##size##_sat((a)) -#define ABS_SAT_OP_EXPAND(a, type, size) ABS_SAT_OP_EXPAND_STR(a, type, size) - -/** Computes max of fixed point types. - * - * @param[in] type the actual data type. - * - * @return The result of the fixed point maximum. - */ -#define MAXQ_IMPL(type) \ - inline type max_##type(type VopA, type VopB) { return max(VopA, VopB); } - -MAXQ_IMPL(qs8x1) -MAXQ_IMPL(qs8x2) -MAXQ_IMPL(qs8x4) -MAXQ_IMPL(qs8x8) -MAXQ_IMPL(qs8x16) -MAXQ_IMPL(qs16x1) -MAXQ_IMPL(qs16x2) -MAXQ_IMPL(qs16x4) -MAXQ_IMPL(qs16x8) -MAXQ_IMPL(qs16x16) - -#define MAX_OP_EXPAND_STR(a, b, type, size) max_##type##x##size((a), (b)) -#define MAX_OP_EXPAND(a, b, type, size) MAX_OP_EXPAND_STR(a, b, type, size) - -/** Computes saturated addition of fixed point types. 
- * - * @param[in] type the actual data type. - * - * @return The result of the fixed point addition. The result is saturated in case of overflow - */ -#define ADDQ_SAT_IMPL(type) \ - inline type add_sat_##type(type VopA, type VopB) { return add_sat(VopA, VopB); } - -ADDQ_SAT_IMPL(qs8x1) -ADDQ_SAT_IMPL(qs8x2) -ADDQ_SAT_IMPL(qs8x4) -ADDQ_SAT_IMPL(qs8x8) -ADDQ_SAT_IMPL(qs8x16) -ADDQ_SAT_IMPL(qs16x1) -ADDQ_SAT_IMPL(qs16x2) -ADDQ_SAT_IMPL(qs16x4) -ADDQ_SAT_IMPL(qs16x8) -ADDQ_SAT_IMPL(qs16x16) -ADDQ_SAT_IMPL(qs32x1) -ADDQ_SAT_IMPL(qs32x2) -ADDQ_SAT_IMPL(qs32x4) -ADDQ_SAT_IMPL(qs32x8) -ADDQ_SAT_IMPL(qs32x16) - -#define ADD_SAT_OP_EXPAND_STR(a, b, type, size) add_sat_##type##x##size((a), (b)) -#define ADD_SAT_OP_EXPAND(a, b, type, size) ADD_SAT_OP_EXPAND_STR(a, b, type, size) - -/** Computes saturated subtraction of fixed point types. - * - * @param[in] type the actual data type. - * - * @return The result of the fixed point subtraction. The result is saturated in case of overflow - */ -#define SUBQ_SAT_IMPL(type) \ - inline type sub_sat_##type(type VopA, type VopB) { return sub_sat(VopA, VopB); } - -SUBQ_SAT_IMPL(qs8x1) -SUBQ_SAT_IMPL(qs8x2) -SUBQ_SAT_IMPL(qs8x4) -SUBQ_SAT_IMPL(qs8x8) -SUBQ_SAT_IMPL(qs8x16) -SUBQ_SAT_IMPL(qs16x1) -SUBQ_SAT_IMPL(qs16x2) -SUBQ_SAT_IMPL(qs16x4) -SUBQ_SAT_IMPL(qs16x8) -SUBQ_SAT_IMPL(qs16x16) - -#define SUB_SAT_OP_EXPAND_STR(a, b, type, size) sub_sat_##type##x##size((a), (b)) -#define SUB_SAT_OP_EXPAND(a, b, type, size) SUB_SAT_OP_EXPAND_STR(a, b, type, size) - -/* Multiply of two fixed point numbers - * - * @param[in] type the actual data type. - * @param[in] itype the intermediate data type. - * - * @return The result of the fixed point multiplication. 
- */ -#define MULQ_IMPL(type, itype) \ - inline type mul_##type(type VopA, type VopB, int fixed_point_position) \ - { \ - itype round_val = (itype)(1 << (fixed_point_position - 1)); \ - itype res = CONVERT((VopA), itype) * CONVERT((VopB), itype) + round_val; \ - return CONVERT((res >> (itype)fixed_point_position), type); \ - } - -MULQ_IMPL(qs8x8, qs16x8) -MULQ_IMPL(qs16x8, qs32x8) -MULQ_IMPL(qs8x16, qs16x16) -MULQ_IMPL(qs16x16, qs32x16) - -#define MUL_OP_EXPAND_STR(a, b, type, size, position) mul_##type##x##size((a), (b), (position)) -#define MUL_OP_EXPAND(a, b, type, size, position) MUL_OP_EXPAND_STR(a, b, type, size, position) - -/* Saturate multiply of two fixed point numbers - * - * @param[in] type the actual data type. - * @param[in] itype the intermediate data type. - * - * @return The result of the fixed point multiplication. The result is saturated in case of overflow - */ -#define MULQ_SAT_IMPL(type, itype) \ - inline type mul_sat_##type(type VopA, type VopB, int fixed_point_position) \ - { \ - itype round_val = (itype)(1 << (fixed_point_position - 1)); \ - itype res = mad_sat(CONVERT((VopA), itype), CONVERT((VopB), itype), round_val); \ - return CONVERT_SAT((res >> (itype)fixed_point_position), type); \ - } - -MULQ_SAT_IMPL(qs8x1, qs16x1) -MULQ_SAT_IMPL(qs8x2, qs16x2) -MULQ_SAT_IMPL(qs8x3, qs16x3) -MULQ_SAT_IMPL(qs8x4, qs16x4) -MULQ_SAT_IMPL(qs8x8, qs16x8) -MULQ_SAT_IMPL(qs8x16, qs16x16) -MULQ_SAT_IMPL(qs16x1, qs32x1) -MULQ_SAT_IMPL(qs16x2, qs32x2) -MULQ_SAT_IMPL(qs16x3, qs32x3) -MULQ_SAT_IMPL(qs16x4, qs32x4) -MULQ_SAT_IMPL(qs16x8, qs32x8) -MULQ_SAT_IMPL(qs16x16, qs32x16) - -#define MUL_SAT_OP_EXPAND_STR(a, b, type, size, position) \ - mul_sat_##type##x##size((a), (b), (position)) -#define MUL_SAT_OP_EXPAND(a, b, type, size, position) \ - MUL_SAT_OP_EXPAND_STR(a, b, type, size, position) - -/** Saturate multiply-accumulate - * - * @param[in] type the actual data type. - * @param[in] itype the intermediate data type. 
- * - * @return The result of the fixed point multiply-accumulate. The result is saturated in case of - * overflow - */ -#define MLAQ_SAT_IMPL(type, itype) \ - type mla_sat_##type(type VopA, type VopB, type VopC, int fixed_point_position) \ - { \ - itype res = mad_sat(CONVERT(VopB, itype), CONVERT(VopC, itype), \ - (itype)(1 << (fixed_point_position - 1))); \ - return add_sat(VopA, CONVERT_SAT(res >> (itype)fixed_point_position, type)); \ - } - -MLAQ_SAT_IMPL(qs8x8, qs16x8) -MLAQ_SAT_IMPL(qs8x16, qs16x16) -MLAQ_SAT_IMPL(qs16x8, qs32x8) - -#define MLA_SAT_OP_EXPAND_STR(a, b, c, type, size, position) \ - mla_sat_##type##x##size((a), (b), (c), (position)) -#define MLA_SAT_OP_EXPAND(a, b, c, type, size, position) \ - MLA_SAT_OP_EXPAND_STR(a, b, c, type, size, position) - -/** Saturate multiply-accumulate long - * - * @param[in] type the actual data type. - * @param[in] itype the intermediate data type. - * - * @return The result of the fixed point multiply-accumulate long. The result is saturated in case - * of overflow - */ -#define MLALQ_SAT_IMPL(type, itype) \ - itype mlal_sat_##type(itype VopA, type VopB, type VopC, int fixed_point_position) \ - { \ - itype res = mad_sat(CONVERT(VopB, itype), CONVERT(VopC, itype), \ - (itype)(1 << (fixed_point_position - 1))); \ - return add_sat(VopA, res >> (itype)fixed_point_position); \ - } - -MLALQ_SAT_IMPL(qs8x8, qs16x8) -MLALQ_SAT_IMPL(qs16x8, qs32x8) - -#define MLAL_SAT_OP_EXPAND_STR(a, b, c, type, size, position) \ - mlal_sat_##type##x##size((a), (b), (c), (position)) -#define MLAL_SAT_OP_EXPAND(a, b, c, type, size, position) \ - MLAL_SAT_OP_EXPAND_STR(a, b, c, type, size, position) - -/** Saturate division of two fixed point vectors - * - * @param[in] stype the actual scalar data type. - * @param[in] type the actual data type. - * @param[in] itype the intermediate data type. - * - * @return The result of the fixed point division. 
The result is saturated in case of overflow - */ -#define DIVQ_SAT_IMPL(stype, type, itype) \ - inline type div_sat_##type(type VopA, type VopB, int fixed_point_position) \ - { \ - itype conv_a = CONVERT((VopA), itype); \ - itype denominator = CONVERT((VopB), itype); \ - itype numerator = conv_a << (itype)(fixed_point_position); \ - itype res = select((itype)(numerator / denominator), \ - select((itype)stype##_MAX, (itype)stype##_MIN, (itype)(conv_a < (itype)0)), \ - (itype)(denominator == (itype)0)); \ - return CONVERT_SAT((res), type); \ - } - -DIVQ_SAT_IMPL(qs8, qs8x16, qs16x16) -DIVQ_SAT_IMPL(qs16, qs16x8, qs32x8) -DIVQ_SAT_IMPL(qs16, qs16x16, qs32x16) -DIVQ_SAT_IMPL(qs8, qs8, qs16) -DIVQ_SAT_IMPL(qs16, qs16, qs32) - -#define DIV_SAT_OP_EXPAND_STR(a, b, type, position) div_sat_##type((a), (b), (position)) -#define DIV_SAT_OP_EXPAND(a, b, type, position) DIV_SAT_OP_EXPAND_STR(a, b, type, position) - -#define DIV_SAT_OP_VEC_EXPAND_STR(a, b, type, size, position) \ - div_sat_##type##x##size((a), (b), (position)) -#define DIV_SAT_OP_VEC_EXPAND(a, b, type, size, position) \ - DIV_SAT_OP_VEC_EXPAND_STR(a, b, type, size, position) - -/** Saturate exponential of a fixed point vector - * - * @note Implemented approach uses taylor polynomial to approximate the exponential function. - * - * @param[in] stype the actual scalar data type. - * @param[in] type the actual data type. - * @param[in] size the number of the calculated elements. - * - * @return The result of the fixed point exponential. 
The result is saturated in case of overflow - */ -#define EXPQ_IMPL(stype, type, size) \ - inline type exp_sat_##type(type VopA, int fixed_point_position) \ - { \ - type const_one = (type)(1 << (fixed_point_position)); \ - type ln2 = (type)((((0x58B9 >> (14 - fixed_point_position))) + 1) >> 1); \ - type inv_ln2 = (type)((((0x38AA >> (14 - fixed_point_position)) + 1) >> 1)) | const_one; \ - type A = (type)(((0x7FBA >> (14 - fixed_point_position)) + 1) >> 1); \ - type B = (type)(((0x3FE9 >> (14 - fixed_point_position)) + 1) >> 1); \ - type C = (type)(((0x1693 >> (14 - fixed_point_position)) + 1) >> 1); \ - type D = (type)(((0x0592 >> (14 - fixed_point_position)) + 1) >> 1); \ - type m = MUL_SAT_OP_EXPAND(VopA, inv_ln2, stype, size, fixed_point_position); \ - type dec_m = m >> (type)fixed_point_position; \ - type alpha = MUL_SAT_OP_EXPAND(dec_m << (type)fixed_point_position, ln2, stype, size, \ - fixed_point_position); \ - alpha = CONVERT(abs_diff(VopA, alpha), type); \ - type sum = add_sat(MUL_SAT_OP_EXPAND(alpha, D, stype, size, fixed_point_position), C); \ - sum = add_sat(MUL_SAT_OP_EXPAND(alpha, sum, stype, size, fixed_point_position), B); \ - sum = add_sat(MUL_SAT_OP_EXPAND(alpha, sum, stype, size, fixed_point_position), A); \ - sum = add_sat(MUL_SAT_OP_EXPAND(alpha, sum, stype, size, fixed_point_position), const_one); \ - return select((type)stype##_MAX, select(sum << dec_m, sum >> -dec_m, dec_m < (type)0), \ - clz(sum) > dec_m); /* Saturate result if needed */ \ - } - -EXPQ_IMPL(qs8, qs8x2, 2) -EXPQ_IMPL(qs8, qs8x4, 4) -EXPQ_IMPL(qs8, qs8x8, 8) -EXPQ_IMPL(qs8, qs8x16, 16) -EXPQ_IMPL(qs16, qs16x2, 2) -EXPQ_IMPL(qs16, qs16x4, 4) -EXPQ_IMPL(qs16, qs16x8, 8) -EXPQ_IMPL(qs16, qs16x16, 16) - -#define EXP_OP_EXPAND_STR(a, type, size, position) exp_sat_##type##x##size((a), (position)) -#define EXP_OP_EXPAND(a, type, size, position) EXP_OP_EXPAND_STR(a, type, size, position) - -/** Saturate logarithm of a fixed point vector - * - * @note Implemented approach uses taylor 
polynomial to approximate the logarithm function. - * - * @param[in] stype the actual scalar data type. - * @param[in] type the actual data type. - * @param[in] size the number of the calculated elements. - * - * @return The result of the fixed point logarithm. The result is saturated in case of overflow - */ -#define LOGQ_IMPL(stype, type, size) \ - inline type log_sat_##type(type VopA, int fixed_point_position) \ - { \ - type const_one = (type)(1 << (fixed_point_position)); \ - type ln2 = (type)(0x58B9 >> (15 - fixed_point_position)); /* 1.4384189 */ \ - type A = (type)(0x5C0F >> (14 - fixed_point_position)); /* 1.4384189 */ \ - type B = -(type)(0x56AE >> (15 - fixed_point_position)); /* -0.6771900 */ \ - type C = (type)(0x2933 >> (15 - fixed_point_position)); /* 0.3218538 */ \ - type D = -(type)(0x0AA7 >> (15 - fixed_point_position)); /* -0.0832229 */ \ - type inter_a = \ - select(VopA, DIV_SAT_OP_VEC_EXPAND(const_one, VopA, stype, size, fixed_point_position), \ - VopA < const_one); \ - type shift_val = (type)(15 - stype##_SHIFT) - clz(inter_a >> (type)fixed_point_position); \ - inter_a = inter_a >> shift_val; \ - inter_a = sub_sat(inter_a, const_one); \ - type sum = add_sat(MUL_SAT_OP_EXPAND(inter_a, D, stype, size, fixed_point_position), C); \ - sum = add_sat(MUL_SAT_OP_EXPAND(inter_a, sum, stype, size, fixed_point_position), B); \ - sum = add_sat(MUL_SAT_OP_EXPAND(inter_a, sum, stype, size, fixed_point_position), A); \ - sum = MUL_SAT_OP_EXPAND(inter_a, sum, stype, size, fixed_point_position); \ - sum = MUL_SAT_OP_EXPAND(add_sat(sum, shift_val << (type)fixed_point_position), ln2, stype, \ - size, fixed_point_position); \ - return select(select(sum, -sum, VopA < const_one), (type)0, \ - VopA < (type)0); /* Saturate result if needed */ \ - } - -LOGQ_IMPL(qs8, qs8x16, 16) -LOGQ_IMPL(qs16, qs16x8, 8) -LOGQ_IMPL(qs16, qs16x16, 16) - -#define LOG_OP_EXPAND_STR(a, type, size, position) log_sat_##type##x##size((a), (position)) -#define LOG_OP_EXPAND(a, type, size, 
position) LOG_OP_EXPAND_STR(a, type, size, position) - -/** Saturate inverse square root of a fixed point vector - * - * @note Implemented approach uses Newton's method to approximate the inverse square root function. - * - * @param[in] stype the actual scalar data type. - * @param[in] type the actual data type. - * @param[in] size the number of the calculated elements. - * - * @return The result of the fixed point inverse square root. The result is saturated in case of - * overflow - */ -#define INVSQRTQ_IMPL(stype, type, size) \ - inline type invsqrt_sat_##type(type VopA, int fixed_point_position) \ - { \ - type const_three = (type)(3 << (fixed_point_position)); \ - type shift_value = (type)(16 - stype##_SHIFT) - (clz(VopA) + (type)fixed_point_position); \ - type temp = select((type)(VopA >> shift_value), \ - select((type)stype##_MAX, (type)(VopA << (-shift_value)), \ - (type)(clz(VopA) > (-shift_value))), \ - (type)(shift_value < (type)0)); \ - type x = temp; \ - x = MUL_SAT_OP_EXPAND( \ - x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \ - fixed_point_position), \ - temp, stype, size, fixed_point_position)), \ - stype, size, fixed_point_position) >> \ - 1; \ - x = MUL_SAT_OP_EXPAND( \ - x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \ - fixed_point_position), \ - temp, stype, size, fixed_point_position)), \ - stype, size, fixed_point_position) >> \ - 1; \ - x = MUL_SAT_OP_EXPAND( \ - x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \ - fixed_point_position), \ - temp, stype, size, fixed_point_position)), \ - stype, size, fixed_point_position) >> \ - 1; \ - if (sizeof((stype)(1)) > 1) /* Perform more iterations if datatype is QS16 */ \ - { \ - x = MUL_SAT_OP_EXPAND( \ - x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \ - fixed_point_position), \ - temp, stype, size, fixed_point_position)), \ - stype, size, fixed_point_position) >> \ - 1; \ 
- x = MUL_SAT_OP_EXPAND( \ - x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \ - fixed_point_position), \ - temp, stype, size, fixed_point_position)), \ - stype, size, fixed_point_position) >> \ - 1; \ - } \ - type shift_value2 = select(shift_value >> 1, (-shift_value) >> 1, shift_value < (type)0); \ - return select((type)(x >> shift_value2), select((type)stype##_MAX, (type)(x << shift_value2), \ - (type)(clz(x) > shift_value2)), \ - (type)(shift_value < (type)0)); /* Saturate result if needed */ \ - } - -INVSQRTQ_IMPL(qs8, qs8x1, 1) -INVSQRTQ_IMPL(qs16, qs16x1, 1) -INVSQRTQ_IMPL(qs8, qs8x16, 16) -INVSQRTQ_IMPL(qs16, qs16x8, 8) - -#define INVSQRT_OP_EXPAND_STR(a, type, size, position) invsqrt_sat_##type##x##size((a), (position)) -#define INVSQRT_OP_EXPAND(a, type, size, position) INVSQRT_OP_EXPAND_STR(a, type, size, position) - -/** Saturate hyperbolic tangent of a fixed point vector - * - * tanh(x) = (e^2x - 1)/(e^2x + 1) - * - * @param[in] stype the actual scalar data type. - * @param[in] type the actual data type. - * @param[in] size the number of the calculated elements. - * - * @return The result of the fixed point hyperbolic tangent. 
The result is saturated in case of - * overflow - */ -#define TANHQ_IMPL(stype, type, size) \ - inline type tanh_sat_##type(type VopA, int fixed_point_position) \ - { \ - type const_one = (type)(1 << (fixed_point_position)); \ - type const_two = (type)(2 << (fixed_point_position)); \ - type exp2x = \ - EXP_OP_EXPAND(MUL_SAT_OP_EXPAND(const_two, VopA, stype, size, fixed_point_position), \ - stype, size, fixed_point_position); \ - type num = SUB_SAT_OP_EXPAND(exp2x, const_one, stype, size); \ - type den = ADD_SAT_OP_EXPAND(exp2x, const_one, stype, size); \ - return DIV_SAT_OP_VEC_EXPAND(num, den, stype, size, fixed_point_position); \ - } - -TANHQ_IMPL(qs8, qs8x16, 16) -TANHQ_IMPL(qs16, qs16x8, 8) - -#define TANH_OP_EXPAND_STR(a, type, size, position) tanh_sat_##type##x##size((a), (position)) -#define TANH_OP_EXPAND(a, type, size, position) TANH_OP_EXPAND_STR(a, type, size, position) - -#define floatx16 float16 -#define float16_TYPE float16 - -#define CONVERTQ_DOWN_IMPL(in_type, out_type) \ - inline out_type convert_##out_type##_##in_type(in_type a, int fixed_point_position) \ - { \ - return CONVERT(a * (1 << fixed_point_position) + \ - select((in_type)-0.5f, (in_type)0.5f, isgreater(a, (in_type)0)), \ - out_type); \ - } - -CONVERTQ_DOWN_IMPL(float16, qs8x16) -CONVERTQ_DOWN_IMPL(float16, qs16x16) - -#define CONVERTQ_DOWN_SAT_IMPL(in_type, out_type) \ - inline out_type convert_##out_type##_##in_type##_sat(in_type a, int fixed_point_position) \ - { \ - return CONVERT_SAT(a * (1 << fixed_point_position) + \ - select((in_type)-0.5f, (in_type)0.5f, isgreater(a, (in_type)0)), \ - out_type); \ - } - -CONVERTQ_DOWN_SAT_IMPL(float16, qs8x16) -CONVERTQ_DOWN_SAT_IMPL(float16, qs16x16) - -#define CONVERTQ_UP_IMPL(in_type, out_type) \ - inline out_type convert_##out_type##_##in_type(in_type a, int fixed_point_position) \ - { \ - return CONVERT(a, out_type) / (1 << fixed_point_position); \ - } - -CONVERTQ_UP_IMPL(qs8x16, float16) -CONVERTQ_UP_IMPL(qs16x16, float16) - -#define 
SQCVT_SAT_IMPL(type) \ - inline type sqcvt_##type##_sat(float a, int fixed_point_position) \ - { \ - return CONVERT_SAT((a * (1 << fixed_point_position) + ((a < 0) ? -0.5f : 0.5f)), type); \ - } - -SQCVT_SAT_IMPL(qs8) -SQCVT_SAT_IMPL(qs16) - -#define SQCVT_SAT_OP_EXPAND_STR(a, type, position) sqcvt_##type##_sat((a), (position)) -#define SQCVT_SAT_OP_EXPAND(a, type, position) SQCVT_SAT_OP_EXPAND_STR((a), type, position) - -#endif // ARM_COMPUTE_FIXED_POINT_H diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl index 25e20f5f2..6b767d6c9 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl @@ -2,25 +2,17 @@ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved * Copyright (c) 2017 ARM Limited. * - * SPDX-License-Identifier: MIT + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: + * http://www.apache.org/licenses/LICENSE-2.0 * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ #include "helpers.h" diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl new file mode 100644 index 000000000..ed7409852 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS) +/** Perform hashtable_lookup of input tensor + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. 
-DVEC_SIZE=16 + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16 + * @attention Number of input dimensions are passed as a preprocessor argument using -DNUM_DIMS=size, e.g. -DNUM_DIMS=4 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] lookups_ptr Pointer to the lookups vector. Supported data types: S32 + * @param[in] lookups_stride_x Stride of the lookups vector in X dimension (in bytes) + * @param[in] lookups_step_x lookups_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector + */ +__kernel void hashtable_lookup(TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output), + VECTOR_DECLARATION(lookups)) +{ + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT); + + Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups); + + int lup_id[4] = {0}; + + lup_id[0] = (NUM_DIMS == 1)?*((__global int *)vector_offset(&lups,get_global_id(0))) + :get_global_id(0); + lup_id[1] = (NUM_DIMS == 2)?*((__global int *)vector_offset(&lups,get_global_id(1))) + :get_global_id(1); + lup_id[2] = (NUM_DIMS == 3)?*((__global int *)vector_offset(&lups,get_global_id(2))) + 
:get_global_id(2)%DEPTH_OUT; + lup_id[3] = (NUM_DIMS == 4)?*((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) + :get_global_id(2) / DEPTH_OUT; + + if (lup_id[NUM_DIMS-1] < 0) + { + VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, 0, (__global DATA_TYPE *)out.ptr); + return; + } + + in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + lup_id[1] * input_step_y + + lup_id[2] * input_step_z + lup_id[3] * input_step_w; + + VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), + 0, (__global DATA_TYPE *)out.ptr); +} +#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h index 8143d2398..0e123ae0a 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h @@ -24,15 +24,23 @@ #ifndef ARM_COMPUTE_HELPER_H #define ARM_COMPUTE_HELPER_H -#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) +#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) #pragma OPENCL EXTENSION cl_khr_fp16 : enable -#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) +#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) -#if defined(ARM_COMPUTE_DEBUG_ENABLED) -#if defined(cl_arm_printf) +#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) +#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) + +#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && \ + defined(cl_arm_integer_dot_product_accumulate_int8) +#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && + // defined(cl_arm_integer_dot_product_accumulate_int8) + +#if defined(ARM_COMPUTE_DEBUG_ENABLED) && 
defined(cl_arm_printf) #pragma OPENCL EXTENSION cl_arm_printf : enable -#endif // defined(cl_arm_printf) -#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) +#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) #define EXPAND(x) x @@ -175,7 +183,7 @@ typedef struct Tensor4D * * @return An image object */ -Vector inline update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, +inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x) { Vector vector = { @@ -201,7 +209,7 @@ Vector inline update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_ * * @return An image object */ -Image inline update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, +inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) { Image img = {.ptr = ptr, @@ -230,7 +238,7 @@ Image inline update_image_workitem_ptr(__global uchar *ptr, uint offset_first_el * * @return A 3D tensor object */ -Image inline update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, +inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) @@ -261,7 +269,7 @@ Image inline update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, * * @return A 3D tensor object */ -Tensor3D inline update_tensor3D_workitem_ptr(__global uchar *ptr, +inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) @@ -276,7 +284,7 @@ Tensor3D inline update_tensor3D_workitem_ptr(__global uchar *ptr, return tensor; } -Tensor4D inline update_tensor4D_workitem_ptr(__global uchar *ptr, +inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, 
uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w, uint step_w, uint mod_size) @@ -299,7 +307,7 @@ Tensor4D inline update_tensor4D_workitem_ptr(__global uchar *ptr, * @param[in] vec Pointer to the starting position of the buffer * @param[in] x Relative X position */ -__global inline const uchar *vector_offset(const Vector *vec, int x) +inline __global const uchar *vector_offset(const Vector *vec, int x) { return vec->ptr + x * vec->stride_x; } @@ -310,7 +318,7 @@ __global inline const uchar *vector_offset(const Vector *vec, int x) * @param[in] x Relative X position * @param[in] y Relative Y position */ -__global inline uchar *offset(const Image *img, int x, int y) +inline __global uchar *offset(const Image *img, int x, int y) { return img->ptr + x * img->stride_x + y * img->stride_y; } @@ -322,7 +330,7 @@ __global inline uchar *offset(const Image *img, int x, int y) * @param[in] y Relative Y position * @param[in] z Relative Z position */ -__global inline const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z) +inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z) { return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z; } @@ -335,7 +343,7 @@ __global inline const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int * @param[in] z Relative Z position * @param[in] w Relative W position */ -__global inline const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w) +inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w) { return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w; diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl new file mode 100644 index 000000000..e3aa463db --- /dev/null +++ 
b/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(DATA_TYPE) +/** Performs a negation of input tensor. + * + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * + * @param[in] in_ptr Pointer to the source image. Supported data types: S16/S32/F16/F32. + * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. 
Supported data types: same as @p input_ptr + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image + */ +__kernel void neg_tensor( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output)) +{ + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VSTORE(VEC_SIZE) + (-VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), 0, (__global DATA_TYPE *)output.ptr); +} +#endif // defined(DATA_TYPE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl new file mode 100644 index 000000000..ecf4696e9 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016, 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#if defined(IW) && defined(IH) && defined(ID) && defined(IB) && defined(DEPTH_OUT) && defined(ZERO_VALUE) +/** Perform space to depth rearrangement of tensor + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. 
-DDATA_TYPE=float + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 + * @attention Input dimensions should be passed as a preprocessor argument using -DIW(width), -DIH(height), -DID(depth) and -DIB(batch). e.g. -DIW = 4 + * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * + * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p inpu +t_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in +bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * + * @param[in] pad_values Padding values for each of the dimensions. Only pad values for Up(for + * batch), Top(for height), Left(for width) and Front(for depth) are + * required. 
Supported data type: S32 + */ + +__kernel void pad( + TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output), + const int4 pad_values) + { + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + + int index[4]={0}; + + index[0] = get_global_id(0);//W + index[1] = get_global_id(1);//H + index[2] = get_global_id(2) % DEPTH_OUT;//C + index[3] = get_global_id(2) / DEPTH_OUT;//N + + if (index[0] < pad_values.x || index[0] >= (IW + pad_values.x) || + index[1] < pad_values.y || index[1] >= (IH + pad_values.y) || + index[2] < pad_values.z || index[2] >= (ID + pad_values.z) || + index[3] < pad_values.w || index[3] >= (IB + pad_values.w)) + { + *((__global DATA_TYPE *)out.ptr) = (DATA_TYPE)ZERO_VALUE; + } + else + { + *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *) + tensor4D_offset(&in, index[0] - pad_values.x, + index[1] - pad_values.y, + index[2] - pad_values.z, + index[3] - pad_values.w)); + } + } + +#endif //if defined(IW) && defined(IH) && defined(ID) && defined(IB) && defined(DEPTH_OUT) && defined(ZERO_VALUE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl new file mode 100644 index 000000000..7cc8b0354 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4) +/** Perform a Generic permute operation on an input tensor of Shape DCHW. + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16 + * @attention Permutation vector is passed as a preprocessor arguement using -DP1, -DP2, -DP3 and -DP4=int, e.g. -DP1=2 + * + * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U1 +6/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in b +ytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in b +ytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in b +ytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] output_ptr Pointer to the destination image. 
Supported data types: same as @p inpu +t_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in +bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void permute_generic( + TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output)) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); + + int out_index[4]; + int in_index[4]; + in_index[0] = get_global_id(0);//W + in_index[1] = get_global_id(1);//H + in_index[2] = get_global_id(2) % DEPTH_IN;//C + in_index[3] = get_global_id(2) / DEPTH_IN;//B + out_index[0] = in_index[P1]; + out_index[1] = in_index[P2]; + out_index[2] = in_index[P3]; + out_index[3] = in_index[P4]; + + *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0],out_index[1],out_index[2],out_index[3])) = *((__global DATA_TYPE *)in.ptr); +} +#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl index 512c62023..aa05121b1 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl +++ 
b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl @@ -2,25 +2,17 @@ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved * Copyright (c) 2016, 2017 ARM Limited. * - * SPDX-License-Identifier: MIT + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: + * http://www.apache.org/licenses/LICENSE-2.0 * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ #include "helpers.h" diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl index 82edf3b1d..fdfb78003 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl @@ -2,40 +2,20 @@ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved * Copyright (c) 2016, 2017 ARM Limited. * - * SPDX-License-Identifier: MIT + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: + * http://www.apache.org/licenses/LICENSE-2.0 * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ #include "helpers.h" -#if defined(FIXED_POINT_POSITION) - -#include "fixed_point.h" - -#if defined(SATURATE) -#define DIV_OP(x, y, scale, type, size) DIV_SAT_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION) -#else // SATURATE -#define DIV_OP(x, y, scale, type, size) DIV_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION) -#endif // SATURATE - -#else // FIXED_POINT_POSITION - #if defined(SATURATE) #define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size##_sat(x)) #else // SATURATE @@ -45,17 +25,14 @@ #define DIV_OP(x, y, scale, type, size) CONVERT_OP_INT((x) / (y) >> scale, type, size) -#endif // FIXED_POINT_POSITION - /** Performs a pixelwise division with integer scale of integer inputs. * * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short * @attention The data_type of the intermediate result of the division should passed as well using -DDATA_TYPE_RES. * e.g. If one of inputs is S16 -DDATA_TYPE_RES=int should be passed else -DDATA_TYPE_RES=short. - * @note In case of fixed-point operation -DFIXED_POINT_POSITION=fixed_point_position must be provided: e.g. -DFIXED_POINT_POSITION=3 * - * @param[in] in1_ptr Pointer to the source image. Supported data types: U8/QS8/QS16/S16 + * @param[in] in1_ptr Pointer to the source image. 
Supported data types: U8/S16 * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) @@ -79,7 +56,7 @@ * @param[in] out_stride_z Stride of the destination image in Y dimension (in bytes) * @param[in] out_step_z out_stride_z * number of elements along Y processed per workitem(in bytes) * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] scale Integer scaling factor. Supported data types: S32 (ignored for QS8 and QS16 as the assumption is scale = 1). + * @param[in] scale Integer scaling factor. Supported data types: S32 */ __kernel void pixelwise_div_int( TENSOR3D_DECLARATION(in1), diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl index ddc9d5a27..ab1307e64 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl @@ -2,25 +2,17 @@ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved * Copyright (c) 2016, 2017 ARM Limited. * - * SPDX-License-Identifier: MIT + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: + * http://www.apache.org/licenses/LICENSE-2.0 * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ #include "helpers_asymm.h" diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl new file mode 100644 index 000000000..68da2ba32 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. 
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(DATA_TYPE)
+/** Returns result of prelu function implemented as below:
+ * f(input) = alpha * input for input < 0, f(input) = input for input >= 0.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Can only take floating point data types.
+ *
+ * @param[in] input1_ptr Pointer to the source image. Supported Data types : F16/F32
+ * @param[in] input1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source image
+ *
+ * @param[in] alpha_ptr Pointer to the alpha tensor. Supported Data types : F16/F32
+ * @param[in] alpha_stride_x Stride of the alpha tensor in X dimension (in bytes)
+ * @param[in] alpha_step_x alpha_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] alpha_stride_y Stride of the alpha tensor in Y dimension (in bytes)
+ * @param[in] alpha_step_y alpha_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] alpha_stride_z Stride of the alpha tensor in Z dimension (in bytes)
+ * @param[in] alpha_step_z alpha_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the alpha tensor
+ *
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void prelu(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(alpha),
+    TENSOR3D_DECLARATION(output))
+{
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    // Component-wise vector select: the relational `<` on vectors yields a per-lane
+    // mask, so lanes with input < 0 store input * alpha and all other lanes store
+    // input unchanged.
+    VSTORE(VEC_SIZE)
+    (VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) < 0 ?
+            VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) * VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)alpha.ptr) :
+            VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr),
+        0, (__global DATA_TYPE *)output.ptr);
+
+}
+#endif // defined(DATA_TYPE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl
new file mode 100644
index 000000000..7e97b7ed6
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+#define SUB(x, y) (x) - (y)
+
+#if defined(OFF_IN1) && defined(OFF_IN2) && defined(OFF_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(VEC_SIZE)
+
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE)
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+
+/** Returns result of prelu function implemented as below:
+ * f(input) = alpha * input for input < 0, f(input) = input for input >= 0.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE_IN compile flag, e.g. -DDATA_TYPE_IN=uchar
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Can only take uchar data types.
+ *
+ * @param[in] input1_ptr Pointer to the source image. Supported Data types : QASYMM8
+ * @param[in] input1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source image
+ *
+ * @param[in] alpha_ptr Pointer to the alpha tensor. Supported Data types : QASYMM8
+ * @param[in] alpha_stride_x Stride of the alpha tensor in X dimension (in bytes)
+ * @param[in] alpha_step_x alpha_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] alpha_stride_y Stride of the alpha tensor in Y dimension (in bytes)
+ * @param[in] alpha_step_y alpha_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] alpha_stride_z Stride of the alpha tensor in Z dimension (in bytes)
+ * @param[in] alpha_step_z alpha_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the alpha tensor
+ *
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void prelu_qasymm8(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(alpha),
+    TENSOR3D_DECLARATION(output))
+{
+    // Get pixels pointer
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)input.ptr), VEC_INT);
+    VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)alpha.ptr), VEC_INT);
+
+    // Dequantize both operands: subtract the zero-point offset, then scale to float.
+    in_a = SUB(in_a, (VEC_INT)((int)OFF_IN1));
+    in_b = SUB(in_b, (VEC_INT)((int)OFF_IN2));
+
+    const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1);
+    const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2);
+    // PReLU in the dequantized (float) domain, per lane.
+    const VEC_FLOAT outf32 = in1f32 < 0 ? in1f32 * in2f32 : in1f32;
+    // Requantize: rescale, re-apply the output offset, round-to-nearest-even
+    // (CONVERT_DOWN uses convert_*_rte) and saturate into uchar.
+    const VEC_FLOAT qresf32 = outf32 / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFF_OUT));
+    const VEC_UCHAR res = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_UCHAR);
+
+    VSTORE(VEC_SIZE)
+    (res, 0, (__global uchar *)output.ptr);
+}
+
+#endif // defined(OFF_IN1) && defined(OFF_IN2) && defined(OFF_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(VEC_SIZE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl
deleted file mode 100644
index dfa3b85f4..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */ -#include "helpers.h" - -#if defined(WIDTH) -/** Perform reduce max - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * - * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[out] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[out] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[out] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void reduce_max(VECTOR_DECLARATION(input), - VECTOR_DECLARATION(output)) -{ - Vector input = CONVERT_TO_VECTOR_STRUCT(input); - Vector output = CONVERT_TO_VECTOR_STRUCT(output); - - __global float *input_addr = (__global float *)(input.ptr); - __global float *output_addr = (__global float *)(output.ptr); - - float max_value = *input_addr; - for(int x = 1; x < WIDTH; x++) - { - float value = *(input_addr + x); - max_value = max(value, max_value); - } - - // Store max - *output_addr = max_value; -} -#endif // defined(WIDTH) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl new file mode 100644 index 000000000..8bef49363 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016, 2017 ARM Limited. 
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
+/** Perform reduce max/min
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
+ * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using
+ *            -DOP_CODE = number. e.g. -DOP_CODE=1
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] axis Axis through which reduction occurs
+ * @param[in] dim Dimension across the axis to be reduced.
+ */
+__kernel void reduce_min_max(TENSOR4D_DECLARATION(input),
+                             TENSOR4D_DECLARATION(output),
+                             const int axis,
+                             const int dim)
+{
+    Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+    // Coordinates of this work-item in the output tensor; the third global id
+    // packs both the Z (depth, % DEPTH_OUT) and W (batch, / DEPTH_OUT) dimensions.
+    int indices[4] =
+    {
+        get_global_id(0),
+        get_global_id(1),
+        get_global_id(2) % DEPTH_OUT,
+        get_global_id(2) / DEPTH_OUT,
+    };
+
+    // Seed the fold with the element at position 0 along the reduced axis
+    // (assumes the output extent along `axis` is 1 so indices[axis] starts at 0
+    // -- confirm against the host-side window setup), then fold in elements 1..dim-1.
+    DATA_TYPE value = *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
+    for(int i = 1; i < dim; ++i)
+    {
+        indices[axis] = i;
+
+        #if OP_CODE == 1 // REDUCE_MAX
+        value = max(value, *((__global DATA_TYPE *)
+                tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])));
+
+        #elif OP_CODE == 2 // REDUCE_MIN
+        value = min(value, *((__global DATA_TYPE *)
+                tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])));
+
+        #else // OP NOT SUPPORTED
+        return;
+
+        #endif
+    }
+
+    *((__global DATA_TYPE *)out.ptr) = value;
+}
+
+/** Perform reduce sum/mean
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
+ * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using
+ *            -DOP_CODE = number. e.g. -DOP_CODE=1
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] axis Axis through which reduction occurs
+ * @param[in] dim Dimension across the axis to be reduced.
+ */
+__kernel void reduce_sum_mean(TENSOR4D_DECLARATION(input),
+                              TENSOR4D_DECLARATION(output),
+                              const int axis,
+                              const int dim)
+{
+    Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+    // Output coordinates; the third global id packs Z (% DEPTH_OUT) and W (/ DEPTH_OUT).
+    int indices[4] =
+    {
+        get_global_id(0),
+        get_global_id(1),
+        get_global_id(2) % DEPTH_OUT,
+        get_global_id(2) / DEPTH_OUT,
+    };
+
+    DATA_TYPE sum_value = (DATA_TYPE)0;
+    for(int i = 0; i < dim; ++i)
+    {
+        indices[axis] = i;
+        sum_value += *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
+    }
+
+    #if OP_CODE == 3 // REDUCE_SUM
+    *((__global DATA_TYPE *)out.ptr) = sum_value;
+
+    #elif OP_CODE == 4 // REDUCE_MEAN
+    // NOTE(review): for integral DATA_TYPE this division truncates toward zero --
+    // confirm that matches the host-side reference for integer REDUCE_MEAN.
+    *((__global DATA_TYPE *)out.ptr) = sum_value / CONVERT(dim, DATA_TYPE);
+
+    #else // OP NOT SUPPORTED
+    return;
+
+    #endif
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduction_mean.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduction_mean.cl
deleted file mode 100644
index 1a96eea61..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduction_mean.cl
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -inline DATA_TYPE sum_8(__global const DATA_TYPE *input) -{ - VEC_DATA_TYPE(DATA_TYPE, 8) - in = vload8(0, input); - in.s0123 += in.s4567; - in.s01 += in.s23; - return ((in.s0 + in.s1)); -} - -/** This function calculates the sum and sum of squares of a given input image. - * - * @note To enable calculation sum of squares -DSTDDEV should be passed as a preprocessor argument. - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] local_sum Local sum of all elements - * @param[in] height Height of the input image - * @param[in] divider Divider to calculate mean - */ -__kernel void reduction_mean( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst), - __local DATA_TYPE *local_sums, - int height, - int divider) -{ - // Get pixels pointer - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - float8 tmp_sum = 0; - // Calculate partial sum - - for(int i = 0; i < height; i++) - { - local_sums[0] += sum_8((__global DATA_TYPE 
*)offset(&src, 0, i));
-    }
-    ((__global DATA_TYPE *)offset(&dst, get_global_id(0), get_global_id(1)))[0] = local_sums[0]/divider;
-}
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl
new file mode 100644
index 000000000..a0fc2d5a9
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE)
+/** Perform space to batch with input of 4D and NCHW format
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
+ * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size. e.g. -DBATCH_IN=16
+ * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DHEIGHT_IN=16
+ * @attention Input tensor width should be given as a preprocessor argument using -DWIDTH_IN=size. e.g. -DWIDTH_IN=16
+ * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] block_size_ptr Pointer to the source tensor. Supported data types: S32
+ * @param[in] block_size_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] block_size_step_x block_size_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] block_size_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] padding_size_ptr Pointer to the source tensor. Supported data types: S32
+ * @param[in] padding_size_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] padding_size_step_x padding_size_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] padding_size_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] padding_size_step_y padding_size_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] padding_size_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void space_to_batch_4d_nchw(TENSOR4D_DECLARATION(input),
+                                     TENSOR4D_DECLARATION(output),
+                                     VECTOR_DECLARATION(block_size),
+                                     IMAGE_DECLARATION(padding_size))
+{
+    Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+    // block_size is a 2-element S32 vector: element 0 is the block width,
+    // element 1 (one x-stride away) the block height.
+    int block_size_x = *((__global int *)(block_size_ptr));
+    int block_size_y = *((__global int *)(block_size_ptr + block_size_stride_x));
+    // Which (x, y) offset inside the block this output batch slice corresponds to:
+    // output batches are laid out as BATCH_IN copies per block position.
+    int shift_x = (get_global_id(2) / DEPTH_OUT / BATCH_IN) % block_size_x;
+    int shift_y = (get_global_id(2) / DEPTH_OUT / BATCH_IN) / block_size_x;
+
+    // Map the output coordinate back to the (possibly padded) input coordinate.
+    // padding_size element 0 is presumably the left pad and the element one
+    // y-stride away the top pad -- confirm against the host-side kernel setup.
+    int in_index[4] = {0, };
+    in_index[0] = get_global_id(0) * block_size_x + shift_x - *((__global int *)(padding_size_ptr));
+    in_index[1] = get_global_id(1) * block_size_y + shift_y - *((__global int *)(padding_size_ptr + padding_size_stride_y));
+    in_index[2] = get_global_id(2) % DEPTH_OUT;
+    in_index[3] = (get_global_id(2) / DEPTH_OUT) % BATCH_IN;
+
+    // Coordinates that fall outside the input are in the padded region: emit the pad value.
+    if (in_index[0] < 0 || in_index[0] >= WIDTH_IN || in_index[1] < 0 || in_index[1] >= HEIGHT_IN)
+    {
+        *((__global DATA_TYPE *)out.ptr) = (DATA_TYPE)ZERO_VALUE;
+    }
+    else
+    {
+        *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3]));
+    }
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE)
+
+#if defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE)
+/** Perform space to batch with input of 4D and NHWC format
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor height should be given as a preprocessor argument using -DHEIGHT_OUT=size. e.g. -DHEIGHT_OUT=16
+ * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size. e.g. -DBATCH_IN=16
+ * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DHEIGHT_IN=16
+ * @attention Input tensor width should be given as a preprocessor argument using -DWIDTH_IN=size. e.g. -DWIDTH_IN=16
+ * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] block_size_ptr Pointer to the source tensor. Supported data types: S32
+ * @param[in] block_size_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] block_size_step_x block_size_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] block_size_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] padding_size_ptr Pointer to the source tensor. Supported data types: S32
+ * @param[in] padding_size_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] padding_size_step_x padding_size_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] padding_size_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] padding_size_step_y padding_size_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] padding_size_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void space_to_batch_4d_nhwc(TENSOR4D_DECLARATION(input),
+                                     TENSOR4D_DECLARATION(output),
+                                     VECTOR_DECLARATION(block_size),
+                                     IMAGE_DECLARATION(padding_size))
+{
+    Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, HEIGHT_OUT);
+
+    // block_size is a 2-element S32 vector: element 0 is the block width,
+    // element 1 (one x-stride away) the block height.
+    int block_size_x = *((__global int *)(block_size_ptr));
+    int block_size_y = *((__global int *)(block_size_ptr + block_size_stride_x));
+    // Which (x, y) offset inside the block this output batch slice corresponds to.
+    int shift_x = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) % block_size_x;
+    int shift_y = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) / block_size_x;
+
+    // NHWC mapping: index 0 is the channel (vectorized by VEC_SIZE), index 1 the
+    // width coordinate, index 2 the height coordinate, index 3 the batch.
+    int in_index[4] = {0, };
+    in_index[0] = get_global_id(0) * VEC_SIZE;
+    in_index[1] = get_global_id(1) * block_size_x + shift_x - *((__global int *)(padding_size_ptr));
+    in_index[2] = get_global_id(2) % HEIGHT_OUT * block_size_y + shift_y - *((__global int *)(padding_size_ptr + padding_size_stride_y));
+    in_index[3] = (get_global_id(2) / HEIGHT_OUT) % BATCH_IN;
+
+    // Coordinates that fall outside the input are in the padded region: emit the pad value.
+    if (in_index[1] < 0 || in_index[1] >= WIDTH_IN || in_index[2] < 0 || in_index[2] >= HEIGHT_IN)
+    {
+        VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))ZERO_VALUE, 0, (__global DATA_TYPE *)out.ptr);
+    }
+    else
+    {
+        VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3])),
+                                 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
+                         0, (__global DATA_TYPE *)out.ptr);
+    }
+}
+
+#endif // defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl
new file mode 100644
index 000000000..f6977045a
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE)
+/** Perform space to depth rearrangement of tensor
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
+ * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g.
-DBLOCK_SIZE=1 + * + * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p inpu +t_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in +bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void space_to_depth( + TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output)) + { + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN); + Tensor4D out = 
CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); + + int out_index[4]={0}; + int in_index[4]={0}; + + in_index[0] = get_global_id(0);//W + in_index[1] = get_global_id(1);//H + in_index[2] = get_global_id(2) % DEPTH_IN;//C + in_index[3] = get_global_id(2) / DEPTH_IN;//B + + out_index[0] = in_index[0]/BLOCK_SIZE; + out_index[1] = in_index[1]/BLOCK_SIZE; + out_index[2] = in_index[2] + ((in_index[1] % BLOCK_SIZE) * BLOCK_SIZE + in_index[0] % BLOCK_SIZE) * DEPTH_IN; + out_index[3] = in_index[3]; + + *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0],out_index[1],out_index[2],out_index[3])) = *((__global DATA_TYPE *)in.ptr); + } +#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl new file mode 100644 index 000000000..3e1a5c97f --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(DATA_TYPE) +/** Returns true value of squared_difference of two tensors. + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. 
-DDATA_TYPE=float + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Can only take floating point data types. + * + * @param[in] input1_ptr Pointer to the source image. Supported data types: F16/F32 + * @param[in] input1_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input1_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source image + * + * @param[in] input2_ptr Pointer to the source image. Supported data types: F16/F32 + * @param[in] input2_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input2_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input2_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input2_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input2_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source image + * + * @param[out] output_ptr Pointer to the destination image. 
Supported data types: F16/F32 + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void squared_difference( + TENSOR3D_DECLARATION(input1), + TENSOR3D_DECLARATION(input2), + TENSOR3D_DECLARATION(output)) +{ + Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1); + Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + diff = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr)- VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr); + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + sq_diff = diff * diff; + + VSTORE(VEC_SIZE) + (sq_diff, 0, (__global DATA_TYPE *)output.ptr); +} +#endif // defined(DATA_TYPE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice.cl deleted file mode 100644 index c5ff82f9e..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice.cl +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - - -inline Tensor4D tensor4D_from_vector_no_step(const Vector *vector, int dim_x, int dim_y, int dim_z, int dim_w) -{ - int stride_x = vector->stride_x; - int stride_y = stride_x * dim_x; - int stride_z = stride_y * dim_y; - int stride_w = stride_z * dim_z; - Tensor4D tensor = - { - .ptr = vector->ptr, - .offset_first_element_in_bytes = vector->offset_first_element_in_bytes, - .stride_x = stride_x, - .stride_y = stride_y, - .stride_z = stride_z, - .stride_w = stride_w, - }; - return tensor; -} - -/** Extracts a strided slice up to 4-dimensions - * - * @note Datatype should be given as a preprocessor argument using -DELEMENT_DATA_TYPE=type. e.g. -DELEMENT_DATA_TYPE=short - * @note The size of an element should be given as a preprocessor argument using -DELEMENT_SIZE=size. e.g. 
-DELEMENT_SIZE=2 - * - * @param[in] input_ptr Pointer to the first source tensor. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32 - * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] dims_in The 4-dimensional dimension of the input. Supported data types: S32 - * @param[in] dims_out The 4-dimensional dimension of the output. Supported data types: S32 - * @param[in] starts The stride of X dimension of input tensor to be sliced. Supported data types: S32 - * @param[in] strides The stride of Y dimension of input tensor to be sliced. 
Supported data types: S32 - */ -__kernel void strided_slice(VECTOR_DECLARATION(input), - VECTOR_DECLARATION(output), - const int4 dims_in, - const int4 dims_out, - const int4 starts, - const int4 strides) -{ - // TODO: Should be change to CONVERT_TO_TENSOR4D_STRUCT in order to reduce inference of the offset - Vector vec_out = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output); - Vector vec_in = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input); - - // Implemenation - // Infer a Tensor4D from output Vector and output's dimensions info - // Infer a Tensor4D from input Vector and input's dimensions info - // Infer indices of output as 4D from the offset of output vector - // Infer indices of input as 4D from indices of output - // out(offset of output vector) = in(offset of input) - - Tensor4D tensor_out = tensor4D_from_vector_no_step(&vec_out, dims_out.x, dims_out.y, dims_out.z, dims_out.w); - Tensor4D tensor_in = tensor4D_from_vector_no_step(&vec_in, dims_in.x, dims_in.y, dims_in.z, dims_in.w); - - // Must be output_step_x == output_stride_x == an element's size - const int offset_out = get_global_id(0) * output_stride_x; - int4 indices_out = - { - get_global_id(0) % dims_out.x, - (offset_out / tensor_out.stride_y) % dims_out.y, - (offset_out / tensor_out.stride_z) % dims_out.z, - (offset_out / tensor_out.stride_w) % dims_out.w, - }; - - int4 indices_in = - { - starts.x + (strides.x * indices_out.x), - starts.y + (strides.y * indices_out.y), - starts.z + (strides.z * indices_out.z), - starts.w + (strides.w * indices_out.w), - }; - - *((__global ELEMENT_DATA_TYPE *)vector_offset(&vec_out, get_global_id(0))) = *((__global ELEMENT_DATA_TYPE *)tensor4D_offset(&tensor_in, indices_in.x, indices_in.y, indices_in.z, indices_in.w)); -} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice_ex.cl new file mode 100644 index 000000000..b39c55b96 --- /dev/null +++ 
b/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice_ex.cl @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#if defined(ELEMENT_DATA_TYPE) && defined(DEPTH_OUT) +/** Extracts a strided slice up to 4-dimensions + * + * @note Datatype should be given as a preprocessor argument using -DELEMENT_DATA_TYPE=type. e.g. -DELEMENT_DATA_TYPE=short + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 + * + * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] output_ptr Pointer to the destination image. 
Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] starts The stride of X dimension of input tensor to be sliced. Supported data types: S32 + * @param[in] strides The stride of Y dimension of input tensor to be sliced. 
Supported data types: S32 + */ +__kernel void strided_slice_ex(TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output), + const int4 starts, + const int4 strides) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + + int4 indices_in = + { + starts.x + (strides.x * get_global_id(0)), + starts.y + (strides.y * get_global_id(1)), + starts.z + (strides.z * (get_global_id(2) % DEPTH_OUT)), + starts.w + (strides.w * (get_global_id(2) / DEPTH_OUT)), + }; + *((__global ELEMENT_DATA_TYPE *)out.ptr) = *((__global ELEMENT_DATA_TYPE *)tensor4D_offset(&in, indices_in.x, indices_in.y, indices_in.z, indices_in.w)); +} +#endif // defined(ELEMENT_DATA_TYPE) && defined(DEPTH_OUT) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl index 0b0cf8218..d97f23a47 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl @@ -2,25 +2,17 @@ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved * Copyright (c) 2017 ARM Limited. * - * SPDX-License-Identifier: MIT + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: + * http://www.apache.org/licenses/LICENSE-2.0 * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ #include "helpers.h" diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl index deadf8412..0292fab04 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl @@ -2,25 +2,17 @@ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved * Copyright (c) 2017 ARM Limited. * - * SPDX-License-Identifier: MIT + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: + * http://www.apache.org/licenses/LICENSE-2.0 * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ #include "helpers.h" diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl index cac0c071e..c2c2d89a4 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl @@ -2,25 +2,17 @@ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved * Copyright (c) 2017 ARM Limited. 
* - * SPDX-License-Identifier: MIT + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: + * http://www.apache.org/licenses/LICENSE-2.0 * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ // reference: diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp new file mode 100644 index 000000000..1fdd2f98f --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLActivationLayerExKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/UtilsEx.h" + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const ActivationLayerInfoEx &act_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, + DataType::F16, DataType::F32); + + // Checks performed when output is configured + if ((output != nullptr) && (output->total_size() != 0)) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + } + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + if (output != nullptr) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*output, *input); + } + + const unsigned int num_elems_processed_per_iteration = 16 / input->element_size(); + + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); + bool window_changed = false; + + if (output != nullptr) + { + AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + window_changed = update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, input->valid_region()); + } + else + { + window_changed = update_window_and_padding( + win, AccessWindowHorizontal(input, 0, num_elems_processed_per_iteration)); + } + + Status err = (window_changed) + ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + return std::make_pair(err, win); +} +} // namespace + +CLActivationLayerExKernel::CLActivationLayerExKernel() + : _input(nullptr), _output(nullptr), _run_in_place(false) +{ +} + +void CLActivationLayerExKernel::configure(ICLTensor *input, ICLTensor *output, + ActivationLayerInfoEx act_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + _run_in_place = (output == nullptr) || (output == input); + + if (output != nullptr) + { + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*output->info(), *input->info()->clone()); + } + + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, act_info)); + + const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size(); + const DataType dt = input->info()->data_type(); + float a_const = act_info.a(); + float b_const = act_info.b(); + int a_const_int = 0; + int b_const_int = 0; + + // Create quantized version of constants a, b if needed + if (is_data_type_quantized(dt)) + { + a_const_int = + input->info()->quantization_info().quantize(a_const, RoundingPolicy::TO_NEAREST_UP); + b_const_int = + input->info()->quantization_info().quantize(b_const, RoundingPolicy::TO_NEAREST_UP); + } + + // Set build options + std::set<std::string> build_opts; + build_opts.emplace( + ("-DACT=" + lower_string(string_from_activation_func_ex(act_info.activation())))); + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt))); + build_opts.emplace( + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + + if (is_data_type_quantized(dt)) + { + build_opts.emplace(("-DA_VAL=" + support::cpp11::to_string(a_const_int))); + build_opts.emplace(("-DB_VAL=" + support::cpp11::to_string(b_const_int))); + + const int o1 = input->info()->quantization_info().offset; + // Quantized value of 0 corresponds to the offset o1 + 
build_opts.emplace(("-DCONST_0=" + support::cpp11::to_string(o1))); + + // Set scale and offset of the input and output if they have different quantization info + if (is_data_type_quantized_asymmetric(dt) && output != nullptr) + { + const float s1 = input->info()->quantization_info().scale; + const float s2 = output->info()->quantization_info().scale; + const int o2 = output->info()->quantization_info().offset; + + if (o1 != o2 || s1 != s2) + { + build_opts.emplace(("-DS1_VAL=" + float_to_string_with_full_precision(s1))); + build_opts.emplace(("-DS2_VAL=" + float_to_string_with_full_precision(s2))); + build_opts.emplace(("-DO1_VAL=" + support::cpp11::to_string(o1))); + build_opts.emplace(("-DO2_VAL=" + support::cpp11::to_string(o2))); + } + } + } + else + { + build_opts.emplace(("-DA_VAL=" + float_to_string_with_full_precision(a_const))); + build_opts.emplace(("-DB_VAL=" + float_to_string_with_full_precision(b_const))); + } + + build_opts.emplace((_run_in_place) ? "-DIN_PLACE" : ""); + + // Create kernel + std::string kernel_name = std::string("activation_layer_ex"); + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + + // Make sure _kernel is initialized before calling the parent's configure + _input = input; + _output = output; + + // Configure kernel window + auto win_config = + validate_and_configure_window(input->info(), (_run_in_place) ? 
nullptr : output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); + + // Set config_id for enabling LWS tuning + _config_id = "activation_layer_ex_"; + _config_id += lower_string(string_from_data_type(dt)); + _config_id += "_"; + _config_id += support::cpp11::to_string(input->info()->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(input->info()->dimension(1)); +} + +Status CLActivationLayerExKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + const ActivationLayerInfoEx &act_info) +{ + const bool run_in_place = (output == nullptr) || (output == input); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), + (run_in_place) ? nullptr : output->clone().get()) + .first); + + return Status{}; +} + +void CLActivationLayerExKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice); + if (!_run_in_place) + { + add_3D_tensor_argument(idx, _output, slice); + } + enqueue(queue, *this, slice, lws_hint()); + } while (collapsed.slide_window_slice_3D(slice)); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp new file mode 100644 index 000000000..c1a2ad0be --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLArgMinMaxKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t argminmax_axis) +{ + TensorShape out_shape{input_shape}; + + out_shape.set(argminmax_axis, 1); + + return out_shape; +} +} // namespace + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const uint32_t argminmax_axis, ArgOperation op) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32, DataType::F32, + DataType::U8); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(input, output); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0, + "Inputs are not broadcast compatible"); + + const TensorShape output_shape = inferOutputShape(input->tensor_shape(), argminmax_axis); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(), + "output shape's size does not match argminmax_axis"); + + const auto num_dimensions = input->tensor_shape().num_dimensions(); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + argminmax_axis >= 0 && 
argminmax_axis < num_dimensions, + "argminmax_axis must be greater than or equal to 0 and less than (input's rank)."); + return Status{}; +} + +} // namespace + +CLArgMinMaxKernel::CLArgMinMaxKernel() : _input(nullptr), _output(nullptr), _argminmax_axis() {} + +void CLArgMinMaxKernel::configure(const ICLTensor *input, ICLTensor *output, + const uint32_t argminmax_axis, ArgOperation op) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), argminmax_axis)); + + _input = input; + _output = output; + _argminmax_axis = argminmax_axis; + + std::unique_ptr<ITensorInfo> output_info = output->info()->clone(); + output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), argminmax_axis)); + + // Construct kernel name for argmax and argmin based on axis + std::string kernel_name = "arg_op"; + int op_code = 0; + if (op == ArgOperation::MAX) + { + op_code = 1; + } + else if (op == ArgOperation::MIN) + { + op_code = 2; + } + else + throw std::runtime_error("Operation not supported, yet"); + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(output_info->data_type())); + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2))); + build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code)); + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + + // Configure kernel window + Window win = calculate_max_window(*output_info, Steps()); + + Coordinates coord; + coord.set_num_dimensions(output_info->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape())); + + ICLKernel::configure_internal(win); +} + +Status CLArgMinMaxKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + const uint32_t argminmax_axis, ArgOperation op) +{ + 
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, argminmax_axis, op)); + + return Status{}; +} + +void CLArgMinMaxKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const TensorShape &shape_in = _input->info()->tensor_shape(); + + unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters + + _kernel.setArg<cl_int>(idx++, _argminmax_axis); + _kernel.setArg<cl_int>(idx++, shape_in[_argminmax_axis]); + + Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + // Setup input slice + Window slice_in(slice_out); + slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_in.set(3, Window::Dimension(0, 0, 0)); + + // Copy output's shape in order to use for recovering at end of this method + const TensorShape shape_out = _output->info()->tensor_shape(); + _output->info()->set_tensor_shape(inferOutputShape(shape_in, _argminmax_axis)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_out); + } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); + + // Recover output's shape of output tensor + _output->info()->set_tensor_shape(shape_out); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp new file mode 100644 index 000000000..1c505b4d5 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionExKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output, ConvertPolicy policy) +{ + ARM_COMPUTE_UNUSED(policy); + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, + DataType::F16, DataType::F32); + + const TensorShape &out_shape = + TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, + "Inputs are not broadcast compatible"); + + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + output->data_type() == DataType::U8 && + (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8), + "Output can only be U8 if 
both inputs are U8"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), + "Wrong shape for output"); + } + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, + ITensorInfo *output) +{ + const std::pair<TensorShape, ValidRegion> broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2); + const TensorShape &out_shape = broadcast_pair.first; + const ValidRegion &valid_region = broadcast_pair.second; + + // Auto initialize output if not initialized + { + set_shape_if_empty(*output, out_shape); + + if (input1->data_type() == DataType::S16 || input2->data_type() == DataType::S16) + { + set_format_if_unknown(*output, Format::S16); + } + else if (input1->data_type() == DataType::F16 && input2->data_type() == DataType::F16) + { + set_format_if_unknown(*output, Format::F16); + } + else if (input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32) + { + set_format_if_unknown(*output, Format::F32); + } + } + + Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); + Window win_input1 = win.broadcast_if_dimension_le_one(*input1); + Window win_input2 = win.broadcast_if_dimension_le_one(*input2); + + AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win_input1, input1_access) || + update_window_and_padding(win_input2, input2_access) || + update_window_and_padding(win, output_access); + + output_access.set_valid_region(win, valid_region); + + Status err = (window_changed) + ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + return std::make_pair(err, win); +} +} // namespace + +CLArithmeticSubtractionExKernel::CLArithmeticSubtractionExKernel() + : _input1(nullptr), _input2(nullptr), _output(nullptr) +{ +} + +void CLArithmeticSubtractionExKernel::configure(const ICLTensor *input1, const ICLTensor *input2, + ICLTensor *output, ConvertPolicy policy) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input1->info(), input2->info(), output->info(), policy)); + + // Configure kernel window + auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + + _input1 = input1; + _input2 = input2; + _output = output; + + const bool has_float_out = is_data_type_float(output->info()->data_type()); + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace((policy == ConvertPolicy::WRAP || has_float_out) ? 
"-DWRAP" : "-DSATURATE"); + build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type())); + build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type())); + build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("arithmetic_sub_ex", build_opts)); + + ICLKernel::configure_internal(win_config.second); +} + +Status CLArithmeticSubtractionExKernel::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, ConvertPolicy policy) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, policy)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), + input2->clone().get(), + output->clone().get()) + .first); + + return Status{}; +} + +void CLArithmeticSubtractionExKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const TensorShape &in_shape1 = _input1->info()->tensor_shape(); + const TensorShape &in_shape2 = _input2->info()->tensor_shape(); + const TensorShape &out_shape = _output->info()->tensor_shape(); + + bool can_collapse = true; + if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) + { + can_collapse = + (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); + for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) + { + can_collapse = (in_shape1[d] == in_shape2[d]); + } + } + + bool has_collapsed = false; + Window collapsed = + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) + : window; + + const TensorShape &in_shape1_collapsed = + has_collapsed ? 
in_shape1.collapsed_from(Window::DimZ) : in_shape1; + const TensorShape &in_shape2_collapsed = + has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; + + Window slice = collapsed.first_slice_window_3D(); + Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); + Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); + + do + { + unsigned int idx = 0; + + add_3D_tensor_argument(idx, _input1, slice_input1); + add_3D_tensor_argument(idx, _input2, slice_input2); + add_3D_tensor_argument(idx, _output, slice); + + enqueue(queue, *this, slice); + + collapsed.slide_window_slice_3D(slice_input1); + collapsed.slide_window_slice_3D(slice_input2); + } while (collapsed.slide_window_slice_3D(slice)); +} + +BorderSize CLArithmeticSubtractionExKernel::border_size() const +{ + const unsigned int replicateSize = + _output->info()->dimension(0) - + std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); + const unsigned int border = + std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); + return BorderSize(0, border, 0, 0); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp new file mode 100644 index 000000000..b0016d23c --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const int32_t *block_size) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size[0] >= 1 && block_size[1] >= 1, + "Block size should be greater than or equal to 1."); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) == output->dimension(2), + "Input Depth should be equal to Output Depth"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + output->dimension(3) * block_size[0] * block_size[1] == input->dimension(3), + "Input batch should be equal to (output batch * block size[0] *block size[1])"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(output->dimension(0) % block_size[1]) && + !(output->dimension(1) % block_size[0]), + "Output height and width should be divisible by block size[0] " + "and block_size[1] respectively"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) == input->dimension(0) * block_size[1]) && + (output->dimension(1) == input->dimension(1) * block_size[0]), + "Output height and width should be 
equal to " + "input_height*blocksize[0] and input_width*blocksize[1] " + "respectively"); + + return Status{}; +} + +} // namespace + +CLBatchToSpaceNDKernel::CLBatchToSpaceNDKernel() : _input(nullptr), _output(nullptr) {} + +void CLBatchToSpaceNDKernel::configure(const ICLTensor *input, ICLTensor *output, + const int32_t *block_size) +{ + + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size)); + + _input = input; + _output = output; + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DBLOCK_SIZE0=" + support::cpp11::to_string(block_size[0])); + build_opts.emplace("-DBLOCK_SIZE1=" + support::cpp11::to_string(block_size[1])); + build_opts.emplace("-DBATCH_OUT=" + support::cpp11::to_string(output->info()->dimension(3))); + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("batch_to_space_nd", build_opts)); + + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps()); + + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + + ICLKernel::configure_internal(win); +} + +void CLBatchToSpaceNDKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + + Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + // Setup output slice + Window slice_out(slice_in); + slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_out.set(Window::DimZ, 
Window::Dimension(0, 0, 0)); + slice_out.set(3, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_out); + add_4D_tensor_argument(idx, _output, slice_in); + enqueue(queue, *this, slice_in); + } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in)); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp new file mode 100644 index 000000000..3d2f2c702 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +Status validate_parameters(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output) +{ + const TensorShape &out_shape = + TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, + "Inputs are not broadcast compatible"); + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, + DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), + "Wrong shape for output"); + } + return Status{}; +} +} // namespace + +CLBinaryLogicalOpKernel::CLBinaryLogicalOpKernel() + : _input1(nullptr), _input2(nullptr), _output(nullptr) +{ +} + +void CLBinaryLogicalOpKernel::configure(const ICLTensor *input1, const ICLTensor *input2, + ICLTensor *output, BinaryLogicalOperation op) +{ + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_parameters(input1->info(), input2->info(), output->info())); + + _input1 = input1; + _input2 = input2; + _output = output; + + // Create kernel + std::string kernel_name = "binary_logical_op"; + std::set<std::string> build_opts; + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type()))); + + int op_code = 0; + 
switch (op) + { + case BinaryLogicalOperation::AND: + op_code = 1; + break; + case BinaryLogicalOperation::OR: + op_code = 2; + break; + default: + throw std::runtime_error("Operation not supported, yet"); + } + + build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code))); + build_opts.emplace( + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + + const std::pair<TensorShape, ValidRegion> broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info()); + + const TensorShape &out_shape = broadcast_pair.first; + const ValidRegion &valid_region = broadcast_pair.second; + + Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); + Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info()); + Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info()); + + AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win_input1, input1_access) || + update_window_and_padding(win_input2, input2_access) || + update_window_and_padding(win, output_access); + + output_access.set_valid_region(win, valid_region); + + ICLKernel::configure_internal(win); +} + +void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const TensorShape &in_shape1 = _input1->info()->tensor_shape(); + const TensorShape &in_shape2 = _input2->info()->tensor_shape(); + const TensorShape &out_shape = _output->info()->tensor_shape(); + + bool 
can_collapse = true; + if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) + { + can_collapse = + (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); + for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) + { + can_collapse = (in_shape1[d] == in_shape2[d]); + } + } + + bool has_collapsed = false; + Window collapsed = + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) + : window; + + const TensorShape &in_shape1_collapsed = + has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; + const TensorShape &in_shape2_collapsed = + has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; + + Window slice = collapsed.first_slice_window_3D(); + Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); + Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input1, slice_input1); + add_3D_tensor_argument(idx, _input2, slice_input2); + add_3D_tensor_argument(idx, _output, slice); + + enqueue(queue, *this, slice); + + collapsed.slide_window_slice_3D(slice_input1); + collapsed.slide_window_slice_3D(slice_input2); + } while (collapsed.slide_window_slice_3D(slice)); +} + +BorderSize CLBinaryLogicalOpKernel::border_size() const +{ + const unsigned int replicateSize = + _output->info()->dimension(0) - + std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); + const unsigned int border = + std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); + return BorderSize(0, border, 0, 0); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp index b019e8c33..bf7ebae3f 100644 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp @@ -17,15 +17,8 @@ #include 
"arm_compute/core/CL/kernels/CLCastKernel.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IAccessWindow.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" using namespace arm_compute; @@ -60,8 +53,8 @@ void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output) { const float scale_in = input->info()->quantization_info().scale; const int offset_in = input->info()->quantization_info().offset; - build_opts.emplace("-DSCALE_IN=" + float_to_string_with_full_precision(scale_in)); - build_opts.emplace("-DOFFSET_IN=" + support::cpp11::to_string(offset_in)); + build_opts.emplace("-DSCALE=" + float_to_string_with_full_precision(scale_in)); + build_opts.emplace("-DOFFSET=" + support::cpp11::to_string(offset_in)); _kernel = static_cast<cl::Kernel>( CLKernelLibraryEx::get().create_kernel("cast_qasymm_in", build_opts)); @@ -70,8 +63,8 @@ void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output) { const float scale_in = output->info()->quantization_info().scale; const int offset_in = output->info()->quantization_info().offset; - build_opts.emplace("-DSCALE_IN=" + float_to_string_with_full_precision(scale_in)); - build_opts.emplace("-DOFFSET_IN=" + support::cpp11::to_string(offset_in)); + build_opts.emplace("-DSCALE=" + float_to_string_with_full_precision(scale_in)); + build_opts.emplace("-DOFFSET=" + support::cpp11::to_string(offset_in)); _kernel = static_cast<cl::Kernel>( CLKernelLibraryEx::get().create_kernel("cast_qasymm_out", build_opts)); @@ -88,7 +81,7 @@ void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output) update_window_and_padding(win, input_access, output_access); output_access.set_valid_region(win, 
input->info()->valid_region()); - ICLKernel::configure(win); + ICLKernel::configure_internal(win); } void CLCastKernel::run(const Window &window, cl::CommandQueue &queue) diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp new file mode 100644 index 000000000..5af5b16ea --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLComparisonOpKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output) +{ + const TensorShape &out_shape = + TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::U16, + DataType::S16, DataType::F16, DataType::S32, + DataType::F32, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::U16, + DataType::S16, DataType::F16, DataType::S32, + DataType::F32, DataType::QASYMM8); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, + "Inputs are not broadcast compatible"); + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), + "Wrong shape for output"); + } + return Status{}; +} +} // namespace + +CLComparisonOpKernel::CLComparisonOpKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr) +{ +} + +void CLComparisonOpKernel::configure(const ICLTensor *input1, const ICLTensor *input2, + ICLTensor *output, const ComparisonOperation &op) +{ + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info())); + + _input1 = input1; + _input2 = input2; + _output = output; + + // Create kernel + std::string kernel_name = "comparison_op"; + int op_code = 0; + + switch (op) + { + case ComparisonOperation::EQUAL: + op_code = 1; + 
break; + case ComparisonOperation::NOT_EQUAL: + op_code = 2; + break; + default: + throw std::runtime_error(" Operation not supported, yet"); + } + + std::set<std::string> build_opts; + build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code))); + build_opts.emplace(("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input1->info()->data_type()))); + build_opts.emplace( + ("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()))); + build_opts.emplace( + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + + if (is_data_type_quantized_asymmetric(input1->info()->data_type()) && + ((input1->info()->quantization_info().offset != input2->info()->quantization_info().offset) || + (input1->info()->quantization_info().scale != input2->info()->quantization_info().scale))) + { + build_opts.emplace("-DOFFSET_IN1=" + + support::cpp11::to_string(input1->info()->quantization_info().offset)); + build_opts.emplace("-DOFFSET_IN2=" + + support::cpp11::to_string(input2->info()->quantization_info().offset)); + build_opts.emplace("-DSCALE_IN1=" + + support::cpp11::to_string(input1->info()->quantization_info().scale)); + build_opts.emplace("-DSCALE_IN2=" + + support::cpp11::to_string(input2->info()->quantization_info().scale)); + kernel_name += "_qasymm8"; + } + + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + + const std::pair<TensorShape, ValidRegion> broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info()); + + const TensorShape &out_shape = broadcast_pair.first; + const ValidRegion &valid_region = broadcast_pair.second; + + // Auto initialize output if not initialized + { + set_shape_if_empty(*output->info(), out_shape); + + if (input1->info()->data_type() == DataType::S16 || + input2->info()->data_type() == DataType::S16) + { + set_format_if_unknown(*output->info(), Format::S16); + } + else if (input1->info()->data_type() == 
DataType::F16 && + input2->info()->data_type() == DataType::F16) + { + set_format_if_unknown(*output->info(), Format::F16); + } + else if (input1->info()->data_type() == DataType::F32 || + input2->info()->data_type() == DataType::F32) + { + set_format_if_unknown(*output->info(), Format::F32); + } + } + + Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); + Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info()); + Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info()); + + AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win_input1, input1_access) || + update_window_and_padding(win_input2, input2_access) || + update_window_and_padding(win, output_access); + + output_access.set_valid_region(win, valid_region); + + ICLKernel::configure_internal(win); +} + +void CLComparisonOpKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const TensorShape &in_shape1 = _input1->info()->tensor_shape(); + const TensorShape &in_shape2 = _input2->info()->tensor_shape(); + const TensorShape &out_shape = _output->info()->tensor_shape(); + + bool can_collapse = true; + if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) + { + can_collapse = + (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); + for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) + { + can_collapse = (in_shape1[d] == in_shape2[d]); + } + } + + bool has_collapsed = false; + Window collapsed = + can_collapse ? 
window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) + : window; + + const TensorShape &in_shape1_collapsed = + has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; + const TensorShape &in_shape2_collapsed = + has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; + + Window slice = collapsed.first_slice_window_3D(); + Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); + Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input1, slice_input1); + add_3D_tensor_argument(idx, _input2, slice_input2); + add_3D_tensor_argument(idx, _output, slice); + + enqueue(queue, *this, slice); + + collapsed.slide_window_slice_3D(slice_input1); + collapsed.slide_window_slice_3D(slice_input2); + } while (collapsed.slide_window_slice_3D(slice)); +} + +BorderSize CLComparisonOpKernel::border_size() const +{ + const unsigned int replicateSize = + _output->info()->dimension(0) - + std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); + const unsigned int border = + std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); + return BorderSize(0, border, 0, 0); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp new file mode 100644 index 000000000..c386e3312 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const int32_t block_size) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1, + "Block size should be greater than or equal to 1."); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0) * block_size, + "Output width should be equal to (Input width * block size)"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(1) != input->dimension(1) * block_size, + "Output height should be equal to (Input height * block size)"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) % (block_size * block_size) != 0, + "Input depth should be divisible by (block size * block size)"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + output->dimension(2) != input->dimension(2) / (block_size * block_size), + "Output depth should be equal to (Input depth / (block size * block size))"); + + return Status{}; +} +} // namespace + +CLDepthToSpaceKernel::CLDepthToSpaceKernel() : _input(nullptr), 
_output(nullptr) +{ + // DO NOTHING +} + +void CLDepthToSpaceKernel::configure(const ICLTensor *input, ICLTensor *output, + const int32_t block_size) +{ + + _input = input; + _output = output; + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size)); + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("depth_to_space", build_opts)); + + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps()); + + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + + ICLKernel::configure_internal(win); +} + +void CLDepthToSpaceKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + + Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + // Setup input slice + Window slice_in(slice_out); + slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_in.set(3, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_out); + } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp new file mode 100644 index 
000000000..0862b78bf --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, input_access, output_access); + input_access.set_valid_region(win, output->valid_region()); + + Status err = (window_changed) + ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + return std::make_pair(err, win); +} +} // namespace + +CLEmbeddingLookupKernel::CLEmbeddingLookupKernel() + : _input(nullptr), _output(nullptr), _lookups(nullptr) +{ +} + +Status CLEmbeddingLookupKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *lookups) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 || input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1); + + return Status{}; +} + +void CLEmbeddingLookupKernel::configure(const ICLTensor *input, ICLTensor *output, + const ICLTensor *lookups) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info())); + + _input = input; + _output = output; + _lookups = lookups; + + // Set kernel build options + std::stringstream kernel_name; + std::set<std::string> build_opts; + kernel_name << "embedding_lookup"; + + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions())); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts)); + + // Configure kernel window + auto win_config = validate_and_configure_window(input->info(), output->info()); + 
ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); +} + +void CLEmbeddingLookupKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + Window win_lookup; + win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_in); + add_1D_tensor_argument(idx, _lookups, win_lookup); + + enqueue(queue, *this, slice_in); + } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_1D(win_lookup)); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp new file mode 100644 index 000000000..b1ee21bdc --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLExpKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +CLExpKernel::CLExpKernel() : _input(nullptr), _output(nullptr) {} + +void CLExpKernel::configure(const ICLTensor *input, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + // Auto initialize output + auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), + input->info()->quantization_info()); + + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + _input = input; + _output = output; + + constexpr unsigned int num_elems_processed_per_iteration = 4; + + // Create kernel + std::set<std::string> build_opts; + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.emplace( + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("exp_layer", build_opts)); + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, input->info()->valid_region()); + + ICLKernel::configure_internal(win); +} + +void CLExpKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window collapsed = 
window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice); + add_3D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } while (collapsed.slide_window_slice_3D(slice)); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp index 23efafa6a..ae2801e2b 100644 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp @@ -17,26 +17,14 @@ #include "arm_compute/core/CL/kernels/CLGatherKernel.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -#include <cmath> -#include <cstdlib> -#include <set> -#include <string> using namespace arm_compute; namespace { -constexpr unsigned int num_elems_processed_per_iteration = 16; +constexpr unsigned int num_elems_processed_per_iteration = 1; Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) @@ -46,6 +34,7 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S32, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); return Status{}; } @@ -57,8 +46,7 @@ CLGatherKernel::CLGatherKernel() : _input1(nullptr), _input2(nullptr), _output(n void CLGatherKernel::configure(const ICLTensor *input1, const 
ICLTensor *input2, ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info())); _input1 = input1; _input2 = input2; @@ -89,11 +77,10 @@ void CLGatherKernel::configure(const ICLTensor *input1, const ICLTensor *input2, static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); // Configure kernel window - const unsigned int num_elems_processed_per_iteration = 1; Window win = calculate_max_window(*input2->info(), Steps(num_elems_processed_per_iteration)); output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); - ICLKernel::configure(win); + ICLKernel::configure_internal(win); } Status CLGatherKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp new file mode 100644 index 000000000..cd7b21c6d --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, input_access, output_access); + input_access.set_valid_region(win, output->valid_region()); + + Status err = (window_changed) + ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + return std::make_pair(err, win); +} +} // namespace + +CLHashtableLookupKernel::CLHashtableLookupKernel() + : _input(nullptr), _output(nullptr), _lookups(nullptr) +{ +} + +Status CLHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITensorInfo *keys, + const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *hits) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0, + "Output's shape was not set"); + 
+ ARM_COMPUTE_ERROR_ON(lookups->dimension(0) != hits->dimension(0) || + output->dimension(output->num_dimensions() - 1) != lookups->dimension(0)); + ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 || input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1); + ARM_COMPUTE_ERROR_ON(keys->num_dimensions() > 1); + ARM_COMPUTE_ERROR_ON(hits->num_dimensions() > 1); + + return Status{}; +} + +void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTensor *keys, + const ICLTensor *input, ICLTensor *output, ICLTensor *hits) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info())); + + _lookups = lookups; + _keys = keys; + _input = input; + _output = output; + _hits = hits; + + // Make _lookup_indices tensor + _lookup_indices = arm_compute::support::cpp14::make_unique<CLTensor>(); + _lookup_indices->allocator()->init( + TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32)); + _lookup_indices->allocator()->allocate(); + + // Set kernel build options + std::stringstream kernel_name; + std::set<std::string> build_opts; + kernel_name << "hashtable_lookup"; + + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions())); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts)); + + // Configure kernel window + auto win_config = validate_and_configure_window(input->info(), output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); +} + +void CLHashtableLookupKernel::run(const Window &window, cl::CommandQueue 
&queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const_cast<ICLTensor *>(_lookups)->map(queue); + const_cast<ICLTensor *>(_keys)->map(queue); + _hits->map(queue); + _lookup_indices->map(queue); + + // Set values of hits + const int32_t *lookups_buf = + reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_lookups)->buffer()); + const int32_t *keys_buf = reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_keys)->buffer()); + uint8_t *hits_buf = reinterpret_cast<uint8_t *>(_hits->buffer()); + int32_t *lookup_indices_buf = reinterpret_cast<int32_t *>(_lookup_indices->buffer()); + + std::map<int32_t, size_t> key_map; + const size_t keys_num = _keys->info()->dimension(0); + for (size_t key_index = 0; key_index < keys_num; key_index++) + { + key_map[keys_buf[key_index]] = key_index; + } + + const size_t lookups_num = _lookups->info()->dimension(0); + for (size_t i = 0; i < lookups_num; ++i) + { + const auto lookup_value = lookups_buf[i]; + const auto it = key_map.find(lookup_value); + if (it != key_map.end()) + { +#if defined(DEBUG) + if (it->second >= lookups_num) + ARM_COMPUTE_ERROR("HashTable Lookup: index out of bounds."); +#endif // defined(DEBUG) + lookup_indices_buf[i] = static_cast<int32_t>(it->second); + hits_buf[i] = static_cast<uint8_t>(1); + } + else + { + lookup_indices_buf[i] = -1; + hits_buf[i] = static_cast<uint8_t>(0); + } + } + + const_cast<ICLTensor *>(_lookups)->unmap(queue); + const_cast<ICLTensor *>(_keys)->unmap(queue); + _hits->unmap(queue); + _lookup_indices->unmap(queue); + + Window win = window.collapse(ICLKernel::window(), 2, 4); + + Window win_lookup; + win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, win); + add_4D_tensor_argument(idx, _output, win); + add_1D_tensor_argument(idx, _lookup_indices.get(), win_lookup); + + enqueue(queue, *this, win); + } while 
(window.slide_window_slice_4D(win) && window.slide_window_slice_1D(win_lookup)); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp new file mode 100644 index 000000000..80d99dd3b --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLNegKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16, DataType::S32, + DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16, DataType::S32, + DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), + output->tensor_shape()); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + return Status{}; +} + +} // namespace + +CLNegKernel::CLNegKernel() : _input(nullptr), _output(nullptr) {} + +void CLNegKernel::configure(const ICLTensor *input, ICLTensor *output) +{ + + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info())); + + _input = input; + _output = output; + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Create kernel + std::set<std::string> build_opts; + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.emplace( + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("neg_tensor", build_opts)); + + // Configure window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, input->info()->valid_region()); + + 
ICLKernel::configure_internal(win); +} + +void CLNegKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice); + add_3D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice, lws_hint()); + } while (collapsed.slide_window_slice_3D(slice)); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp new file mode 100644 index 000000000..12bbe910f --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + NormalizationLayerInfo norm_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); + + // Checks performed when output is configured + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + } + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, + NormalizationLayerInfo norm_info) +{ + // Output tensor auto initialization if not yet initialized + auto_init_if_empty(*output, *input->clone()); + + const unsigned int norm_size = norm_info.norm_size(); + bool is_in_map = norm_info.is_in_map(); + + const unsigned int border_width = is_in_map ? std::min(norm_size / 2, 3U) : 0; + const BorderSize border_size = BorderSize(0, border_width); + + const unsigned int num_elems_processed_per_iteration = 4; + const unsigned int num_elems_read_per_iteration = + is_in_map ? 
(num_elems_processed_per_iteration + 2 * (norm_size / 2)) + : num_elems_processed_per_iteration; + + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); + + // We do not use a Rectangle window for IN_MAP_2D as we clamp the top and bottom accesses inside + // the kernel, avoiding padding + AccessWindowHorizontal input_access(input, -border_size.left, num_elems_read_per_iteration); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->valid_region()); + + Status err = (window_changed) + ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + return std::make_pair(err, win); +} +} // namespace + +CLNormalizationLayerExKernel::CLNormalizationLayerExKernel() + : _input(nullptr), _output(nullptr), _border_size(0), _is_in_map(false) +{ +} + +BorderSize CLNormalizationLayerExKernel::border_size() const { return _border_size; } + +void CLNormalizationLayerExKernel::configure(const ICLTensor *input, ICLTensor *output, + NormalizationLayerInfo norm_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + // Output tensor auto initialization if not yet initialized + auto_init_if_empty(*output->info(), *input->info()->clone()); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), norm_info)); + + _input = input; + _output = output; + + const unsigned int num_elems_processed_per_iteration = 4; + const bool is_in_map_2D = (norm_info.type() == NormType::IN_MAP_2D); + + // Set build options + CLBuildOptions build_opts; + build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.add_option( + ("-DCOEFF=" + float_to_string_with_full_precision(norm_info.scale_coeff()))); + build_opts.add_option(("-DBETA=" + 
float_to_string_with_full_precision(norm_info.beta()))); + build_opts.add_option(("-DKAPPA=" + float_to_string_with_full_precision(norm_info.kappa()))); + build_opts.add_option( + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + build_opts.add_option(("-DRADIUS=" + support::cpp11::to_string(norm_info.norm_size()))); + build_opts.add_option(("-DNUM_SLICES=" + support::cpp11::to_string(input->info()->dimension(2)))); + build_opts.add_option_if(is_in_map_2D, "-DIN_MAP_2D"); + + // Create kernel + std::string kernel_name = + _is_in_map ? "normalization_layer_in_map" : "normalization_layer_cross_map"; + _kernel = static_cast<cl::Kernel>( + CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options())); + + // Configure kernel window + auto win_config = validate_and_configure_window(input->info(), output->info(), norm_info); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); + + // Set config_id for enabling LWS tuning + _config_id = "normalization_layer_"; + _config_id += lower_string(string_from_data_type(input->info()->data_type())); + _config_id += "_"; + _config_id += support::cpp11::to_string( + static_cast<std::underlying_type<NormType>::type>(norm_info.type())); + _config_id += "_"; + _config_id += support::cpp11::to_string(norm_info.norm_size()); + _config_id += "_"; + _config_id += support::cpp11::to_string(input->info()->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(input->info()->dimension(1)); +} + +Status CLNormalizationLayerExKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + NormalizationLayerInfo norm_info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, norm_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), output->clone().get(), norm_info).first); + + return Status{}; +} + +void CLNormalizationLayerExKernel::run(const Window &window, 
cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const int collapsed_dimension = _is_in_map ? Window::DimZ : 4; + Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), collapsed_dimension); + Window slice = window_collapsed.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice); + add_3D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } while (window_collapsed.slide_window_slice_3D(slice)); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp new file mode 100644 index 000000000..241f8ae4d --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLPReLUKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +Status validate_info(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output) +{ + const TensorShape &out_shape = + TensorShape::broadcast_shape(input->tensor_shape(), alpha->tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, + DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(alpha, 1, DataType::F16, DataType::F32, + DataType::QASYMM8); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, + "Inputs are not broadcast compatible"); + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), + "Wrong shape for output"); + } + return Status{}; +} +} // namespace + +CLPReLUKernel::CLPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {} + +void CLPReLUKernel::configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, alpha); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), alpha->info(), output->info())); + + _input = input; + _alpha = alpha; + _output = output; + + // Create kernel + std::string kernel_name = "prelu"; + std::set<std::string> build_opts; + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.emplace( + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + + if 
(is_data_type_quantized_asymmetric(input->info()->data_type())) + { + build_opts.emplace("-DOFF_IN1=" + + support::cpp11::to_string(input->info()->quantization_info().offset)); + build_opts.emplace("-DOFF_IN2=" + + support::cpp11::to_string(alpha->info()->quantization_info().offset)); + build_opts.emplace("-DOFF_OUT=" + + support::cpp11::to_string(output->info()->quantization_info().offset)); + build_opts.emplace("-DSCALE_IN1=" + + support::cpp11::to_string(input->info()->quantization_info().scale)); + build_opts.emplace("-DSCALE_IN2=" + + support::cpp11::to_string(alpha->info()->quantization_info().scale)); + build_opts.emplace("-DSCALE_OUT=" + + support::cpp11::to_string(output->info()->quantization_info().scale)); + kernel_name += "_qasymm8"; + } + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + + const std::pair<TensorShape, ValidRegion> broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info()); + + const TensorShape &out_shape = broadcast_pair.first; + const ValidRegion &valid_region = broadcast_pair.second; + + // Auto initialize output if not initialized + { + set_shape_if_empty(*output->info(), out_shape); + + if (input->info()->data_type() == DataType::F16 && alpha->info()->data_type() == DataType::F16) + { + set_format_if_unknown(*output->info(), Format::F16); + } + else if (input->info()->data_type() == DataType::F32 || + alpha->info()->data_type() == DataType::F32) + { + set_format_if_unknown(*output->info(), Format::F32); + } + } + + Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); + Window win_input1 = win.broadcast_if_dimension_le_one(*input->info()); + Window win_input2 = win.broadcast_if_dimension_le_one(*alpha->info()); + + AccessWindowHorizontal input1_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input2_access(alpha->info(), 0, num_elems_processed_per_iteration); + 
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win_input1, input1_access) || + update_window_and_padding(win_input2, input2_access) || + update_window_and_padding(win, output_access); + + output_access.set_valid_region(win, valid_region); + + ICLKernel::configure_internal(win); +} + +void CLPReLUKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const TensorShape &in_shape1 = _input->info()->tensor_shape(); + const TensorShape &in_shape2 = _alpha->info()->tensor_shape(); + const TensorShape &out_shape = _output->info()->tensor_shape(); + + bool can_collapse = true; + if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) + { + can_collapse = + (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); + for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) + { + can_collapse = (in_shape1[d] == in_shape2[d]); + } + } + + bool has_collapsed = false; + Window collapsed = + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) + : window; + + const TensorShape &in_shape1_collapsed = + has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; + const TensorShape &in_shape2_collapsed = + has_collapsed ? 
in_shape2.collapsed_from(Window::DimZ) : in_shape2; + + Window slice = collapsed.first_slice_window_3D(); + Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); + Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice_input1); + add_3D_tensor_argument(idx, _alpha, slice_input2); + add_3D_tensor_argument(idx, _output, slice); + + enqueue(queue, *this, slice); + + collapsed.slide_window_slice_3D(slice_input1); + collapsed.slide_window_slice_3D(slice_input2); + } while (collapsed.slide_window_slice_3D(slice)); +} + +BorderSize CLPReLUKernel::border_size() const +{ + const unsigned int replicateSize = + _output->info()->dimension(0) - + std::min(_input->info()->dimension(0), _alpha->info()->dimension(0)); + const unsigned int border = + std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); + return BorderSize(0, border, 0, 0); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp new file mode 100644 index 000000000..99b54c822 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLPadLayerKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input_info, const ITensorInfo *output_info, + const ITensorInfo *pad_size_info) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_info, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_info, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(pad_size_info, 1, DataType::S32); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_info->num_dimensions() > 0 && + input_info->num_dimensions() <= 4, + "Pad kernel supports upto 4-D input tensor"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + input_info->num_dimensions() == output_info->num_dimensions(), + "output tensor should have same number of dimensions as input tensor"); + + if (input_info->data_type() == DataType::QASYMM8) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_info->quantization_info() != + output_info->quantization_info(), + "The input and output quantization info are different!"); + } + + return Status{}; +} + +} // namespace + +CLPadLayerKernel::CLPadLayerKernel() : _input(nullptr), _output(nullptr), _pad_size(nullptr) {} + +void CLPadLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *pad_size) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, pad_size); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pad_size->info())); + + _input = input; + _output = output; + _pad_size = pad_size; + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + 
build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); + build_opts.emplace("-DIB=" + support::cpp11::to_string(input->info()->dimension(3))); + build_opts.emplace("-DIW=" + support::cpp11::to_string(input->info()->dimension(0))); + build_opts.emplace("-DIH=" + support::cpp11::to_string(input->info()->dimension(1))); + build_opts.emplace("-DID=" + support::cpp11::to_string(input->info()->dimension(2))); + if (input->info()->data_type() == DataType::QASYMM8) + { + build_opts.emplace("-DZERO_VALUE=" + + support::cpp11::to_string(input->info()->quantization_info().offset)); + } + else + { + build_opts.emplace("-DZERO_VALUE=" + support::cpp11::to_string(0)); + } + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("pad", build_opts)); + + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps()); + + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + + ICLKernel::configure_internal(win); +} + +void CLPadLayerKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + + _pad_size->map(queue); + + // Padding values only for up, top, left and front are required based on the rank of tensor + int rank = _pad_size->info()->dimension(1); + + auto pad_batch_up = + (rank == 4) ? *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, 0})) : 0; + auto pad_height_top = + (rank >= 2) + ? *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, (rank == 2) ? 0 : 1})) + : 0; + auto pad_width_left = (rank >= 1) + ? *reinterpret_cast<const int32_t *>( + _pad_size->ptr_to_element({0, (rank == 4) ? 2 : rank - 1})) + : 0; + auto pad_depth_front = + (rank >= 3) + ? 
*reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, (rank == 3) ? 0 : 3})) + : 0; + + _pad_size->unmap(queue); + + // Pad_values which needs to be passed + const cl_int4 paddingValues = { + {static_cast<cl_int>(pad_width_left), static_cast<cl_int>(pad_height_top), + static_cast<cl_int>(pad_depth_front), static_cast<cl_int>(pad_batch_up)}}; + + Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + // Setup output slice + Window slice_in(slice_out); + slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_in.set(3, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + _kernel.setArg<cl_int4>(idx++, paddingValues); + enqueue(queue, *this, slice_out); + } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in)); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp new file mode 100644 index 000000000..aa094761c --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLPermuteExKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +using namespace arm_compute; + +namespace +{ +TensorShape get_output_shape(const ITensorInfo *input, const PermutationVector &perm) +{ + TensorShape output_shape = input->tensor_shape(); + permute(output_shape, perm); + return output_shape; +} + +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const PermutationVector &perm) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + + const TensorShape output_shape = + misc::shape_calculator::compute_permutation_output_shape(*input, perm); + + // Validate configured output + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + } + return Status{}; +} +} // namespace + +CLPermuteExKernel::CLPermuteExKernel() : _input(nullptr), _output(nullptr), _perm() {} + +void CLPermuteExKernel::configure(const ICLTensor *input, ICLTensor *output, + const PermutationVector &perm) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), perm)); + + _input = input; + _output = output; + _perm = perm; + + const TensorShape output_shape = get_output_shape(input->info(), perm); + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); + + // Create kernel + std::set<std::string> 
build_opts; + + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2))); + + // New positions of batch(D), height(H), width(w) and channel(C) based on permutation vector + build_opts.emplace("-DP1=" + support::cpp11::to_string(perm[0])); + build_opts.emplace("-DP2=" + support::cpp11::to_string(perm[1])); + build_opts.emplace("-DP3=" + support::cpp11::to_string(perm[2])); + build_opts.emplace("-DP4=" + support::cpp11::to_string(perm[3])); + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("permute_generic", build_opts)); + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps()); + + // The CLPermute doesn't need padding so update_window_and_padding() can be skipped + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + + ICLKernel::configure_internal(win); +} + +Status CLPermuteExKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + const PermutationVector &perm) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, perm)); + + return Status{}; +} + +void CLPermuteExKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + + Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + // Setup output slice + Window slice_out(slice_in); + slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_out.set(3, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, 
_input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_in); + } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp index a3e0163de..b985aa737 100644 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp @@ -17,20 +17,8 @@ #include "arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -#include <cmath> -#include <cstdlib> -#include <set> -#include <string> using namespace arm_compute; @@ -45,12 +33,10 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, ARM_COMPUTE_UNUSED(overflow_policy); ARM_COMPUTE_UNUSED(rounding_policy); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, - DataType::QS16, DataType::S16, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, - DataType::QS16, DataType::S16, DataType::F16, - DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative."); const TensorShape 
&out_shape = @@ -58,21 +44,11 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2); - - if (is_data_type_fixed_point(input1->data_type())) - { - // All data types must be all QS8 or all QS16 - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale != 1, - "Unsupported scaling factor for QS8/QS16. Scale must be 1."); - } // Validate in case of configured output if (output->total_size() > 0) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, - DataType::QS16, DataType::S16, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MSG( output->data_type() == DataType::U8 && @@ -81,11 +57,6 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, ARM_COMPUTE_RETURN_ERROR_ON_MSG( detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output"); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, output); - if (is_data_type_fixed_point(input1->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); - } } return Status{}; @@ -191,14 +162,6 @@ void CLPixelWiseDivisionKernel::configure(const ICLTensor *input1, const ICLTens { compute_type = "int"; } - else if (input1->info()->data_type() == DataType::QS8) - { - compute_type = "qs8"; - } - else if (input1->info()->data_type() == DataType::QS16) - { - compute_type = "qs16"; - } else { compute_type = "ushort"; @@ -218,11 +181,6 @@ void CLPixelWiseDivisionKernel::configure(const ICLTensor *input1, const ICLTens : "-DSATURATE"); build_opts.emplace((rounding_policy == RoundingPolicy::TO_ZERO) ? 
"-DROUND=_rtz" : "-DROUND=_rte"); - if (is_data_type_fixed_point(input1->info()->data_type())) - { - build_opts.emplace("-DFIXED_POINT_POSITION=" + - support::cpp11::to_string(input1->info()->fixed_point_position())); - } build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type())); build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type())); build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); @@ -245,7 +203,7 @@ void CLPixelWiseDivisionKernel::configure(const ICLTensor *input1, const ICLTens _kernel.setArg(idx++, scale); } - ICLKernel::configure(win_config.second); + ICLKernel::configure_internal(win_config.second); } Status CLPixelWiseDivisionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp deleted file mode 100644 index 168b246bf..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/core/CL/kernels/CLReduceMaxKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -#include <cmath> -#include <cstdlib> -#include <set> -#include <string> - -using namespace arm_compute; - -namespace -{ -constexpr unsigned int num_elems_processed_per_iteration = 16; - -Status validate_arguments(const ITensorInfo *input, int32_t axis, const ITensorInfo *output) -{ - // We can handle for simple case only - // Input rank: 2 - // Output rank: 1 - // Axis: one axis value, restrict to 1 - - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis != 1, "Axis only allowed 1"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0, - "Inputs are not broadcast compatible"); - - // Validate in case of configured output - if (output->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != input->data_type(), - "Output same type allowed for input and output"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().num_dimensions() != 1, - "Only support for output dimension 1"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->tensor_shape().num_dimensions() != 2, - "Only support for input dimension 2"); - } - - return Status{}; -} - -} // namespace - -CLReduceMaxKernel::CLReduceMaxKernel() : _input(nullptr), _output(nullptr), _axis(0) {} - -void CLReduceMaxKernel::configure(const ICLTensor *input, int32_t axis, ICLTensor *output) -{ - 
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, output->info())); - - _input = input; - _output = output; - _axis = axis; - - // Configure kernel window - int cols = _input->info()->tensor_shape()[0]; - int rows = _input->info()->tensor_shape()[1]; - Window win; - win.set(0, Window::Dimension(0, cols, 1)); - win.set(1, Window::Dimension(0, rows, 1)); - - // Construct kernel name - std::string kernel_name = "reduce_max"; - - // Set kernel build options - std::set<std::string> build_opts; - build_opts.emplace("-DWIDTH=" + support::cpp11::to_string(cols)); - - // Create kernel - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); - - ICLKernel::configure(win); -} - -Status CLReduceMaxKernel::validate(const ITensorInfo *input, int32_t axis, - const ITensorInfo *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, output)); - - return Status{}; -} - -void CLReduceMaxKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window window_input = window; - Window slice_input = window_input.first_slice_window_1D(); - - do - { - Window slice_output = slice_input.shift_dimensions(1); - unsigned int idx = 0; - add_1D_tensor_argument(idx, _input, slice_input); - add_1D_tensor_argument(idx, _output, slice_output); - enqueue(queue, *this, slice_input); - - } while (window_input.slide_window_slice_1D(slice_input)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp new file mode 100644 index 000000000..f581780e1 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved + * Copyright (c) 2017-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; +namespace +{ +// NOTE This is necessary because it is not guaranteed that the axis positions of input and output +// are the same. 
+const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis) +{ + TensorShape out_shape{input_shape}; + + out_shape.set(axis, 1); + + return out_shape; +} +} // namespace + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis, + ReduceOperation op) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + } + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, + DataType::F32, DataType::S32); + if (op == ReduceOperation::MEAN || op == ReduceOperation::SUM) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8, + "Not support QASYMM8, yet"); + } + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0, + "Inputs are not broadcast compatible"); + + const auto num_dimensions = input->tensor_shape().num_dimensions(); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + axis >= 0 && axis < num_dimensions, + "axis must be greater than or equal to 0 and less than (input's rank)."); + + const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(), + "output shape's size does not match axis"); + + return Status{}; +} +} // namespace + +CLReduceOperationKernel::CLReduceOperationKernel() : _input(nullptr), _output(nullptr), _axis() {} + +void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *output, + const uint32_t axis, ReduceOperation op) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op)); + + _input = input; + _output = output; + _axis = axis; + + std::unique_ptr<ITensorInfo> output_info = output->info()->clone(); + 
output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis)); + + // Construct kernel name + std::string kernel_name; + int op_code = 0; + if (op == ReduceOperation::MAX) + { + kernel_name = "reduce_min_max"; + op_code = 1; + } + else if (op == ReduceOperation::MIN) + { + kernel_name = "reduce_min_max"; + op_code = 2; + } + else if (op == ReduceOperation::SUM) + { + kernel_name = "reduce_sum_mean"; + op_code = 3; + } + else if (op == ReduceOperation::MEAN) + { + kernel_name = "reduce_sum_mean"; + op_code = 4; + } + else + throw std::runtime_error("Operation not supported, yet"); + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(output_info->data_type())); + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2))); + build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code)); + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + + // Configure kernel window + Window win = calculate_max_window(*output_info, Steps()); + + Coordinates coord; + coord.set_num_dimensions(output_info->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape())); + + ICLKernel::configure_internal(win); +} + +Status CLReduceOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + const uint32_t axis, ReduceOperation op) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op)); + + return Status{}; +} + +void CLReduceOperationKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const TensorShape &shape_in = _input->info()->tensor_shape(); + + unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the 
input and output parameters + + _kernel.setArg<cl_int>(idx++, _axis); + _kernel.setArg<cl_int>(idx++, shape_in[_axis]); + + // Support dimensions up to 4 + Window slice_out = window.collapse(ICLKernel::window(), 2, 4); + + // Setup input slice + Window slice_in(slice_out); + slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_in.set(3, Window::Dimension(0, 0, 0)); + + // Copy output's shape in order to use for recovering at end of this method + // TODO Remove changing and recovering output's shape if it is guaranteed that the axis positions + // of input and output are the same + const TensorShape shape_out = _output->info()->tensor_shape(); + _output->info()->set_tensor_shape(inferOutputShape(shape_in, _axis)); + + idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_out); + + // Recover output's shape of output tensor + _output->info()->set_tensor_shape(shape_out); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReductionMeanKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReductionMeanKernel.cpp deleted file mode 100644 index 84a77122d..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLReductionMeanKernel.cpp +++ /dev/null @@ -1,198 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLReductionMeanKernel.h" - -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/FixedPoint.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -#include "support/ToolchainSupport.h" - -using namespace arm_compute; - -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - std::vector<uint32_t> axis) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis.size() >= TensorShape::num_max_dimensions, - "Reduction axis greater than max number of dimensions"); - - std::vector<uint32_t>::const_iterator it; - bool axis_w = false; - bool axis_h = false; - for (it = axis.begin(); it != axis.end(); ++it) - { - if ((*it) == 0) - { - axis_w = true; - } - else if ((*it) == 1) - { - axis_h = true; - } - else - { - ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported axis!"); - } - } - // TODO Other axises (currently, only axises for both width and height are supported.) 
- if (!axis_w || !axis_h) - { - ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported axis!"); - } - - if (output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != DataLayout::NCHW); - } - - return Status{}; -} - -std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, - std::vector<uint32_t> axis) -{ - // Output tensor auto initialization if not yet initialized - TensorShape output_shape{input->tensor_shape()}; - output_shape.set(0, 1); - output_shape.set(1, 1); - auto_init_if_empty(*output, output_shape, output->num_channels(), input->data_type(), - input->fixed_point_position()); - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration_x = 8; // step - const unsigned int num_elems_processed_per_iteration_y = input->dimension(1); - - Window win = calculate_max_window( - *input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, - num_elems_processed_per_iteration_y); - AccessWindowHorizontal output_access(output, 0, 1); - bool window_changed = update_window_and_padding(win, input_access, output_access); - output_access.set_valid_region(win, output->valid_region()); - - Status err = (window_changed) - ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; - - return std::make_tuple(err, win); -} -} // namespace - -CLReductionMeanKernel::CLReductionMeanKernel() - : _input(nullptr), _output(nullptr), _reduction_axis(), _border_size() -{ -} - -BorderSize CLReductionMeanKernel::border_size() const { return _border_size; } - -void CLReductionMeanKernel::configure(const ICLTensor *input, ICLTensor *output, - std::vector<uint32_t> axis) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis)); - - _input = input; - _output = output; - _reduction_axis = axis; - - constexpr unsigned int num_elems_processed_per_iteration_x = 8; // step - - // Set border size - _border_size = BorderSize( - ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration_x) - - input->info()->dimension(0)); - - // Set build options - std::set<std::string> build_opts; - build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); - // build_opts.emplace(("-DVEC_SIZE=" + - // support::cpp11::to_string(num_elems_processed_per_iteration))); - if (is_data_type_fixed_point(input->info()->data_type())) - { - build_opts.emplace("-DFIXED_POINT_POSITION=" + - support::cpp11::to_string(input->info()->fixed_point_position())); - } - - // Create kernel - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("reduction_mean", build_opts)); - - // Configure kernel window - auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis); - - ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); - - ICLKernel::configure(std::get<1>(win_config)); -} - -Status CLReductionMeanKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - std::vector<uint32_t> axis) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis)); - ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>( - 
validate_and_configure_window(input->clone().get(), output->clone().get(), axis))); - - return Status{}; -} - -void CLReductionMeanKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - // Set out window - Window out_window(window); - out_window.set(Window::DimX, Window::Dimension(0, 0, 0)); - - // Get first input and output slices - Window in_slice = window.first_slice_window_2D(); - Window out_slice = out_window.first_slice_window_2D(); - - // Set local sums buffer - // TODO work_group - unsigned int local_sum_size = _lws_hint[0] * _input->info()->element_size(); - - unsigned int idx = 2 * num_arguments_per_2D_tensor(); - _kernel.setArg(idx++, local_sum_size, nullptr); - _kernel.setArg<cl_int>(idx++, static_cast<cl_int>(_input->info()->dimension(1))); // height - _kernel.setArg<cl_int>(idx++, static_cast<cl_int>(_input->info()->dimension(0) * - _input->info()->dimension(1))); // divider - - do - { - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input, in_slice); - in_slice.set_dimension_step(Window::DimY, _input->info()->dimension(1)); - add_2D_tensor_argument(idx, _output, out_slice); - enqueue(queue, *this, in_slice); - } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp new file mode 100644 index 000000000..6b0697e89 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_size, + const ITensorInfo *padding_size, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::F16, DataType::S32, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_size, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(padding_size, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::F16, DataType::S32, + DataType::F32); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != output->num_dimensions(), + "The number of dimensions of input should be equal to output"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() != output->data_layout(), + "The input and output layouts are different!"); + + // TODO Support other cases + if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NCHW) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != output->dimension(2), + "Input Depth should be equal to Output Depth"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 
|| + padding_size->dimension(1) != 2, + "Only 2-dimensional spatial block's size was wrong"); + } + else if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NHWC) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(0) != output->dimension(0), + "Input Depth should be equal to Output Depth"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 || + padding_size->dimension(1) != 2, + "Only 2-dimensional spatial block's size was wrong"); + } + else + { + ARM_COMPUTE_RETURN_ERROR_MSG("CLSpaceToBatchNDKernel supports only 4-dimensional input"); + } + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() < 2 && input->num_dimensions() > 4, + "CLSpaceToBatchNDKernel supports dimensions up to 4"); + + if (input->data_type() == DataType::QASYMM8) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->quantization_info() != output->quantization_info(), + "The input and output quantization info are different!"); + } + + return Status{}; +} + +} // namespace + +CLSpaceToBatchNDKernel::CLSpaceToBatchNDKernel() : _input(nullptr), _output(nullptr) {} + +void CLSpaceToBatchNDKernel::configure(const ICLTensor *input, const ICLTensor *block_size, + const ICLTensor *padding_size, ICLTensor *output) +{ + + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), block_size->info(), padding_size->info(), output->info())); + + _input = input; + _block_size = block_size; + _padding_size = padding_size; + _output = output; + + // Set kernel build options + // TODO Support other cases + std::string kernel_name = "space_to_batch_4d"; + std::set<std::string> build_opts; + Window win; + + if (input->info()->data_layout() == DataLayout::NCHW) + { + kernel_name += "_nchw"; + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); + build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(1))); + build_opts.emplace("-DWIDTH_IN=" + 
support::cpp11::to_string(input->info()->dimension(0))); + + win = calculate_max_window(*output->info(), Steps()); + + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + } + else if (input->info()->data_layout() == DataLayout::NHWC) + { + kernel_name += "_nhwc"; + build_opts.emplace("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); + build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(2))); + build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(1))); + build_opts.emplace("-DVEC_SIZE=" + + support::cpp11::to_string(num_elems_processed_per_iteration)); + + win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, input_access, output_access); + input_access.set_valid_region(win, output->info()->valid_region()); + + if (window_changed) + { + ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!"); + } + } + else + { + ARM_COMPUTE_ERROR("Unsupported layout"); + } + + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(3))); + if (input->info()->data_type() == DataType::QASYMM8) + { + build_opts.emplace("-DZERO_VALUE=" + + support::cpp11::to_string(input->info()->quantization_info().offset)); + } + else + { + build_opts.emplace("-DZERO_VALUE=" + support::cpp11::to_string(0)); + } + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + + // Configure kernel window + 
ICLKernel::configure_internal(win); +} + +void CLSpaceToBatchNDKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + +#if defined(DEBUG) + const_cast<ICLTensor *>(_block_size)->map(queue); + const_cast<ICLTensor *>(_padding_size)->map(queue); + + const size_t num_dimensions = _input->info()->num_dimensions(); + const size_t num_spacial_dimensions = _block_size->info()->dimension(0); + int32_t batch_size = _input->info()->dimension(num_dimensions - 1); + for (size_t i = 0; i < num_spacial_dimensions; ++i) + { + const int32_t block_size = *reinterpret_cast<int32_t *>(_block_size->ptr_to_element({i})); + const int32_t padding_size_pre = + *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({0, i})); + const int32_t padding_size_post = + *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({1, i})); + + ARM_COMPUTE_ERROR_ON_MSG(block_size < 1, "Block size should be greater than or equal to 1"); + ARM_COMPUTE_ERROR_ON_MSG(padding_size_pre < 0 && padding_size_post < 0, + "Padding size should be greater than or equal to 0"); + + if (num_dimensions == 4 && _input->info()->data_layout() == DataLayout::NCHW) + { + ARM_COMPUTE_ERROR_ON_MSG( + _output->info()->dimension(i) != + (_input->info()->dimension(i) + padding_size_pre + padding_size_post) / block_size, + "Dimension value of spatial block does not match output's dimension value"); + } + else + { + ARM_COMPUTE_ERROR_ON_MSG( + _output->info()->dimension(num_dimensions - num_spacial_dimensions - 1 + i) != + (_input->info()->dimension(num_dimensions - num_spacial_dimensions - 1 + i) + + padding_size_pre + padding_size_post) / + block_size, + "Dimension value of spatial block does not match output's dimension value"); + } + + batch_size *= block_size; + } + ARM_COMPUTE_ERROR_ON_MSG( + _output->info()->dimension(num_dimensions - 1) != batch_size, + "Output batch size should be equal 
to input batch size * (multiplication of all block size)"); + + const_cast<ICLTensor *>(_block_size)->unmap(queue); + const_cast<ICLTensor *>(_padding_size)->unmap(queue); +#endif // defined(DEBUG) + + Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + // Setup output slice + Window slice_in(slice_out); + slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_in.set(3, Window::Dimension(0, 0, 0)); + + // Set block size window + Window win_block = calculate_max_window(*_block_size->info(), Steps()); + + // Set padding size window + Window win_padding = calculate_max_window(*_padding_size->info(), Steps()); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + add_1D_tensor_argument(idx, _block_size, win_block); + add_2D_tensor_argument(idx, _padding_size, win_padding); + enqueue(queue, *this, slice_out); + } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in)); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp new file mode 100644 index 000000000..5d6329edc --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const int32_t block_size) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size >= 1, + "Block size should be greater than or equal to 1."); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(3) == output->dimension(3), + "Input batch should be equal to Output batch"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + input->dimension(2) * block_size * block_size == output->dimension(2), + "Output depth should be equal to (input depth * block size *block size)"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(input->dimension(0) % block_size) && + !(input->dimension(1) % block_size), + "Input height and width should be divisible by block size"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) == (input->dimension(0) / block_size)) && + (output->dimension(1) == (input->dimension(1) / block_size)), + "Output height and width should be equal to " + "input_height/blocksize and input_width/blocksize respectively"); + + return 
Status{}; +} + +} // namespace + +CLSpaceToDepthKernel::CLSpaceToDepthKernel() : _input(nullptr), _output(nullptr) {} + +void CLSpaceToDepthKernel::configure(const ICLTensor *input, ICLTensor *output, + const int32_t block_size) +{ + + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size)); + + _input = input; + _output = output; + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size)); + build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2))); + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("space_to_depth", build_opts)); + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps()); + + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + + ICLKernel::configure_internal(win); +} + +void CLSpaceToDepthKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + + Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + // Setup output slice + Window slice_out(slice_in); + slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_out.set(3, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_in); + } while (window.slide_window_slice_4D(slice_in) && 
window.slide_window_slice_4D(slice_out)); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp new file mode 100644 index 000000000..260bc39f1 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLSquaredDifferenceKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) +{ + const TensorShape &out_shape = + TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::F16, DataType::F32); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, + "Inputs are not broadcast compatible"); + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), + "Wrong shape for output"); + } + return Status{}; +} +} // namespace + +CLSquaredDifferenceKernel::CLSquaredDifferenceKernel() + : _input1(nullptr), _input2(nullptr), _output(nullptr) +{ +} + +void CLSquaredDifferenceKernel::configure(const ICLTensor *input1, const ICLTensor *input2, + ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); + ARM_COMPUTE_ERROR_THROW_ON(validate(input1->info(), input2->info(), output->info())); + + _input1 = input1; + _input2 = input2; + _output = output; + + // Create kernel + std::set<std::string> build_opts; + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type()))); + build_opts.emplace( + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + _kernel = 
static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("squared_difference", build_opts)); + + const std::pair<TensorShape, ValidRegion> broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info()); + + const TensorShape &out_shape = broadcast_pair.first; + const ValidRegion &valid_region = broadcast_pair.second; + + // Auto initialize output if not initialized + { + set_shape_if_empty(*output->info(), out_shape); + + if (input1->info()->data_type() == DataType::F16 && + input2->info()->data_type() == DataType::F16) + { + set_format_if_unknown(*output->info(), Format::F16); + } + else if (input1->info()->data_type() == DataType::F32 || + input2->info()->data_type() == DataType::F32) + { + set_format_if_unknown(*output->info(), Format::F32); + } + } + + Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); + Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info()); + Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info()); + + AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win_input1, input1_access) || + update_window_and_padding(win_input2, input2_access) || + update_window_and_padding(win, output_access); + + output_access.set_valid_region(win, valid_region); + + ICLKernel::configure_internal(win); +} + +void CLSquaredDifferenceKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const TensorShape &in_shape1 = _input1->info()->tensor_shape(); + const TensorShape &in_shape2 = _input2->info()->tensor_shape(); + const TensorShape &out_shape = 
_output->info()->tensor_shape(); + + bool can_collapse = true; + if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) + { + can_collapse = + (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); + for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) + { + can_collapse = (in_shape1[d] == in_shape2[d]); + } + } + + bool has_collapsed = false; + Window collapsed = + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) + : window; + + const TensorShape &in_shape1_collapsed = + has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; + const TensorShape &in_shape2_collapsed = + has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; + + Window slice = collapsed.first_slice_window_3D(); + Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); + Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input1, slice_input1); + add_3D_tensor_argument(idx, _input2, slice_input2); + add_3D_tensor_argument(idx, _output, slice); + + enqueue(queue, *this, slice); + + collapsed.slide_window_slice_3D(slice_input1); + collapsed.slide_window_slice_3D(slice_input2); + } while (collapsed.slide_window_slice_3D(slice)); +} + +BorderSize CLSquaredDifferenceKernel::border_size() const +{ + const unsigned int replicateSize = + _output->info()->dimension(0) - + std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); + const unsigned int border = + std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); + return BorderSize(0, border, 0, 0); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp index 80ffd423a..48146a43a 100644 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceKernel.cpp +++ 
b/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp @@ -14,43 +14,30 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h" +#include "arm_compute/core/CL/kernels/CLStridedSliceExKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include <string> - -using namespace std; using namespace arm_compute; -static const int32_t maxDim = 4; - -CLStridedSliceKernel::CLStridedSliceKernel() +CLStridedSliceExKernel::CLStridedSliceExKernel() : _input(nullptr), _output(nullptr), _beginData(nullptr), _endData(nullptr), _stridesData(nullptr), _beginMask(0), _endMask(0), _shrinkAxisMask(0) { } -Status CLStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *begin, const ITensorInfo *end, - const ITensorInfo *strides, int32_t beginMask, - int32_t endMask, int32_t shrinkAxisMask) +Status CLStridedSliceExKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *begin, const ITensorInfo *end, + const ITensorInfo *strides, int32_t beginMask, + int32_t endMask, int32_t shrinkAxisMask) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, begin, end, strides); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8, DataType::U16, - DataType::S16, DataType::QS16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); + input, 1, DataType::U8, 
DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(begin, 1, DataType::S32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(end, 1, DataType::S32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(strides, 1, DataType::S32); @@ -153,15 +140,6 @@ inline int32_t StopForAxis(int32_t endMask, int32_t end, int32_t stride, return stop; } -inline int32_t offset4D(const TensorShape &shape, int32_t b, int32_t d, int32_t h, int32_t w) -{ - int32_t offset = b * shape[2] * shape[1] * shape[0]; - offset += d * shape[1] * shape[0]; - offset += h * shape[0]; - offset += w; - return offset; -} - inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride) { int32_t ret = 0; @@ -177,10 +155,10 @@ inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride) return ret; } -void CLStridedSliceKernel::configure(const ICLTensor *input, ICLTensor *output, - ICLTensor *beginData, ICLTensor *endData, - ICLTensor *stridesData, int32_t beginMask, int32_t endMask, - int32_t shrinkAxisMask) +void CLStridedSliceExKernel::configure(const ICLTensor *input, ICLTensor *output, + ICLTensor *beginData, ICLTensor *endData, + ICLTensor *stridesData, int32_t beginMask, int32_t endMask, + int32_t shrinkAxisMask) { ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), beginData->info(), endData->info(), stridesData->info(), beginMask, endMask, @@ -195,48 +173,31 @@ void CLStridedSliceKernel::configure(const ICLTensor *input, ICLTensor *output, _endMask = endMask; _shrinkAxisMask = shrinkAxisMask; - constexpr unsigned int num_elems_processed_per_iteration = 1; - // Set kernel build options std::set<std::string> build_opts; build_opts.emplace("-DELEMENT_DATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.emplace("-DELEMENT_SIZE=" + support::cpp11::to_string(input->info()->element_size())); + build_opts.emplace("-DDEPTH_OUT=" + 
support::cpp11::to_string(output->info()->dimension(2))); // Create kernel - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("strided_slice", build_opts)); + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("strided_slice_ex", build_opts)); - // Create output's window without padding - TensorShape collapsed = output->info()->tensor_shape(); - collapsed.collapse(4); - TensorInfo info = *output->info(); - info.set_tensor_shape(collapsed); - Window win = calculate_max_window(info, Steps(num_elems_processed_per_iteration)); - - ICLKernel::configure(win); + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps()); + ICLKernel::configure_internal(win); } -void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue) +void CLStridedSliceExKernel::run(const Window &window, cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - // Create input window - TensorShape collapsed = _input->info()->tensor_shape(); - collapsed.collapse(4); - TensorInfo info = *_input->info(); - info.set_tensor_shape(collapsed); - Window win_in = calculate_max_window(info, Steps(_input->info()->tensor_shape().total_size())); - _beginData->map(queue); _endData->map(queue); _stridesData->map(queue); - std::vector<int32_t> dimsIn; - std::vector<int32_t> dimsOut; std::vector<int32_t> starts; - std::vector<int32_t> stops; std::vector<int32_t> strides; for (uint32_t n = 0; n < _beginData->info()->tensor_shape().total_size(); ++n) @@ -246,22 +207,13 @@ void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue) StartForAxis(_beginMask, reinterpret_cast<int32_t *>(_beginData->buffer())[n], reinterpret_cast<int32_t *>(_stridesData->buffer())[n], shape, n)); - stops.emplace_back(StopForAxis(_endMask, reinterpret_cast<int32_t *>(_endData->buffer())[n], - reinterpret_cast<int32_t 
*>(_stridesData->buffer())[n], shape, - n)); - strides.emplace_back(reinterpret_cast<int32_t *>(_stridesData->buffer())[n]); - dimsIn.emplace_back(shape[n]); - dimsOut.emplace_back(getOutDim(starts[n], stops[n], strides[n])); } for (uint32_t n = _beginData->info()->tensor_shape().total_size(); n < 4; n++) { starts.emplace_back(0); - stops.emplace_back(1); strides.emplace_back(1); - dimsIn.emplace_back(1); - dimsOut.emplace_back(1); } // TODO: Apply shrinkAxisMask @@ -269,20 +221,7 @@ void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue) _stridesData->unmap(queue); _endData->unmap(queue); - // Set parameters - unsigned int idx = 2 * num_arguments_per_1D_tensor(); // Skip the input and output parameters - const cl_int4 dimsInArg = {{ - static_cast<cl_int>(dimsIn[0]), static_cast<cl_int>(dimsIn[1]), - static_cast<cl_int>(dimsIn[2]), static_cast<cl_int>(dimsIn[3]), - }}; - _kernel.setArg<cl_int4>(idx++, dimsInArg); - - const cl_int4 dimsOutArg = {{ - static_cast<cl_int>(dimsOut[0]), static_cast<cl_int>(dimsOut[1]), - static_cast<cl_int>(dimsOut[2]), static_cast<cl_int>(dimsOut[3]), - }}; - _kernel.setArg<cl_int4>(idx++, dimsOutArg); - + unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters const cl_int4 startsArg = {{ static_cast<cl_int>(starts[0]), static_cast<cl_int>(starts[1]), static_cast<cl_int>(starts[2]), static_cast<cl_int>(starts[3]), @@ -295,10 +234,20 @@ void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue) }}; _kernel.setArg<cl_int4>(idx++, stridesArg); - // TODO: Apply slicing output's window - idx = 0; - add_1D_tensor_argument(idx, _input, win_in); - add_1D_tensor_argument(idx, _output, window); + Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + // Setup output slice + Window slice_in(slice_out); + slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + 
slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_in.set(3, Window::Dimension(0, 0, 0)); - enqueue(queue, *this, window); + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_out); + } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); } diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp index d95b485b7..073c2f7bb 100644 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp @@ -17,15 +17,8 @@ #include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -#include <climits> -#include <cassert> namespace arm_compute { @@ -59,7 +52,7 @@ void CLTopKV2Single::configure(ICLTensor *input, ICLTensor *topk_values, ICLTens // Configure kernel window Window win; win.set(0, Window::Dimension(0, 1, 1)); - ICLKernel::configure(win); + ICLKernel::configure_internal(win); } void CLTopKV2Single::run(const Window &window, cl::CommandQueue &queue) @@ -102,7 +95,7 @@ void CLTopKV2Init::configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffe // Configure kernel window Window win; win.set(0, Window::Dimension(0, n, 1)); - ICLKernel::configure(win); + ICLKernel::configure_internal(win); } void CLTopKV2Init::run(const Window &window, cl::CommandQueue &queue) @@ -147,7 +140,7 @@ void CLRadixSortHistogram::configure(cl::Buffer *hist_buf, int bits, int n) // Configure kernel window Window win; win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1)); - ICLKernel::configure(win); + 
ICLKernel::configure_internal(win); } void CLRadixSortHistogram::run(const Window &window, cl::CommandQueue &queue) @@ -192,7 +185,7 @@ void CLRadixSortScanHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_ // Configure kernel window Window win; win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1)); - ICLKernel::configure(win); + ICLKernel::configure_internal(win); } void CLRadixSortScanHistogram::run(const Window &window, cl::CommandQueue &queue) @@ -236,7 +229,7 @@ void CLRadixSortGlobalScanHistogram::configure(cl::Buffer *glob_sum_buf, cl::Buf // Configure kernel window Window win; win.set(0, Window::Dimension(0, _HISTOSPLIT / 2, 1)); - ICLKernel::configure(win); + ICLKernel::configure_internal(win); } void CLRadixSortGlobalScanHistogram::run(const Window &window, cl::CommandQueue &queue) @@ -275,7 +268,7 @@ void CLRadixSortPasteHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob // Configure kernel window Window win; win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1)); - ICLKernel::configure(win); + ICLKernel::configure_internal(win); } void CLRadixSortPasteHistogram::run(const Window &window, cl::CommandQueue &queue) @@ -322,7 +315,7 @@ void CLRadixSortReorder::configure(cl::Buffer *hist_buf, int bits, int n) // Configure kernel window Window win; win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1)); - ICLKernel::configure(win); + ICLKernel::configure_internal(win); } void CLRadixSortReorder::run(const Window &window, cl::CommandQueue &queue) @@ -365,7 +358,7 @@ void CLTopKV2FindFirstNegative::configure(cl::Buffer *first_negative_idx_buf, in // Configure kernel window Window win; win.set(0, Window::Dimension(0, n, 1)); - ICLKernel::configure(win); + ICLKernel::configure_internal(win); } void CLTopKV2FindFirstNegative::run(const Window &window, cl::CommandQueue &queue) @@ -404,7 +397,7 @@ void CLTopKV2ReorderNegatives::configure(cl::Buffer *first_negative_idx_buf, int // Configure kernel window Window win; win.set(0, 
Window::Dimension(0, n, 1)); - ICLKernel::configure(win); + ICLKernel::configure_internal(win); } void CLTopKV2ReorderNegatives::run(const Window &window, cl::CommandQueue &queue) @@ -449,7 +442,7 @@ void CLTopKV2Store::configure(ICLTensor *values, ICLTensor *indices, int k, int // Configure kernel window Window win; win.set(0, Window::Dimension(0, k, 1)); - ICLKernel::configure(win); + ICLKernel::configure_internal(win); } void CLTopKV2Store::setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf) diff --git a/libs/ARMComputeEx/src/core/NEON/kernels/NENormalizationLayerExKernel.cpp b/libs/ARMComputeEx/src/core/NEON/kernels/NENormalizationLayerExKernel.cpp new file mode 100644 index 000000000..3b5782c25 --- /dev/null +++ b/libs/ARMComputeEx/src/core/NEON/kernels/NENormalizationLayerExKernel.cpp @@ -0,0 +1,294 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/NEON/kernels/NENormalizationLayerExKernel.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/NEMath.h" + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *input_squared, + const ITensorInfo *output, const NormalizationLayerInfo &norm_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_squared, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_squared); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, input_squared); + + // Checks performed when output is configured + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + } + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, + ITensorInfo *input_squared, + ITensorInfo *output, + const NormalizationLayerInfo &norm_info) +{ + unsigned int num_elems_processed_per_iteration = 16 / input->element_size(); + const unsigned int num_elems_read_per_iteration = + num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2); + const unsigned int num_rows = + (norm_info.type() == NormType::IN_MAP_2D) ? norm_info.norm_size() : 1; + const unsigned int border_width = + (norm_info.is_cross_map()) ? 
0 : std::min<unsigned int>(norm_info.norm_size() / 2, 3U); + BorderSize border_size = BorderSize(0, border_width); + bool window_changed = false; + + // Configure window + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); + + AccessWindowRectangle input_access(input, -border_size.left, 0, num_elems_read_per_iteration, + num_rows); + AccessWindowRectangle input_squared_access(input_squared, -border_size.left, 0, + num_elems_read_per_iteration, num_rows); + + if (output->total_size() != 0) + { + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + window_changed = + update_window_and_padding(win, input_access, input_squared_access, output_access); + output_access.set_valid_region(win, input->valid_region()); + } + else + { + window_changed = update_window_and_padding(win, input_access, input_squared_access); + } + + Status err = (window_changed) + ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + return std::make_pair(err, win); +} +} // namespace + +NENormalizationLayerExKernel::NENormalizationLayerExKernel() + : _func(nullptr), _input(nullptr), _input_squared(nullptr), _output(nullptr), + _norm_info(NormType::IN_MAP_1D), _border_size() +{ +} + +BorderSize NENormalizationLayerExKernel::border_size() const { return _border_size; } + +void NENormalizationLayerExKernel::configure(const ITensor *input, const ITensor *input_squared, + ITensor *output, NormalizationLayerInfo norm_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_squared, output); + // Output tensor auto initialization if not yet initialized + auto_init_if_empty(*output->info(), *input->info()); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), input_squared->info(), output->info(), norm_info)); + + const unsigned int border_width = + (norm_info.is_cross_map()) ? 
0 : std::min<unsigned int>(norm_info.norm_size() / 2, 3U); + + _input = input; + _input_squared = input_squared; + _output = output; + _norm_info = norm_info; + _border_size = BorderSize(0, border_width); + + switch (_input->info()->data_type()) + { + case DataType::F32: + { + switch (norm_info.type()) + { + case NormType::IN_MAP_1D: + _func = &NENormalizationLayerExKernel::normalize_float<DataType::F32, 0, false>; + break; + case NormType::IN_MAP_2D: + // Normalize over X and Y + _func = &NENormalizationLayerExKernel::normalize_float<DataType::F32, 0, true>; + break; + case NormType::CROSS_MAP: + _func = &NENormalizationLayerExKernel::normalize_float<DataType::F32, 2, false>; + break; + default: + break; + } + break; + } + case DataType::F16: + { + switch (norm_info.type()) + { + case NormType::IN_MAP_1D: + _func = &NENormalizationLayerExKernel::normalize_float<DataType::F16, 0, false>; + break; + case NormType::IN_MAP_2D: + // Normalize over X and Y + _func = &NENormalizationLayerExKernel::normalize_float<DataType::F16, 0, true>; + break; + case NormType::CROSS_MAP: + _func = &NENormalizationLayerExKernel::normalize_float<DataType::F16, 2, false>; + break; + default: + break; + } + break; + } + default: + ARM_COMPUTE_ERROR("NOT SUPPORTED!"); + } + + // Configure kernel window + auto win_config = validate_and_configure_window(input->info(), input_squared->info(), + output->info(), norm_info); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + INEKernel::configure(win_config.second); +} + +template <DataType dt, unsigned int dim, bool do_2D_norm> +void NENormalizationLayerExKernel::normalize_float(const Window &window) +{ + Iterator input(_input, window); + Iterator input_squared(_input_squared, window); + Iterator output(_output, window); + + const int dim_y = 1; + const int radius = _norm_info.norm_size(); + const int total_size = _input->info()->dimension(dim) - 1; + const int input_squared_stride = _input_squared->info()->strides_in_bytes()[dim]; + // We account 
padding across X only and we iterate over rows + const int min_left = (dim == 2) ? 0 : -static_cast<int>(border_size().left); + const int max_right = (dim == 2) ? total_size : total_size + border_size().left; + const int min_top = 0; + const int max_bottom = _input->info()->dimension(dim_y) - 1; + + if (dt == DataType::F32) + { + const float32x4_t coeff_vec = vdupq_n_f32(_norm_info.scale_coeff()); + const float32x4_t beta_vec = vdupq_n_f32(_norm_info.beta()); + const float32x4_t kappa_vec = vdupq_n_f32(_norm_info.kappa()); + + execute_window_loop( + window, + [&](const Coordinates &id) { + // Get range to normalize + const int current_row = do_2D_norm ? id[dim_y] : 0; + const int current_slice = id[dim]; + const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0; + const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0; + const int first_slice = std::max(current_slice - radius, min_left); + const int last_slice = std::min(current_slice + radius, max_right); + + // Accumulate 2D In-Map values + float32x4_t accu = vdupq_n_f32(0.f); + for (int j = first_row; j <= last_row; j++) + { + // Compute row displacement + const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y]; + const uint8_t *const input_squared_ptr = + input_squared.ptr() + row - (current_slice * input_squared_stride); + for (int i = first_slice; i <= last_slice; ++i) + { + accu = vaddq_f32(accu, vld1q_f32(reinterpret_cast<const float *>( + input_squared_ptr + i * input_squared_stride))); + } + } + + // Normalize + const float32x4_t normalized = vpowq_f32(vmlaq_f32(kappa_vec, coeff_vec, accu), beta_vec); + const float32x4_t normalized_pixel = vmulq_f32( + vld1q_f32(reinterpret_cast<const float *>(input.ptr())), vinvq_f32(normalized)); + vst1q_f32(reinterpret_cast<float *>(output.ptr()), normalized_pixel); + }, + input, input_squared, output); + } +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + else if (dt == DataType::F16) + { + 
const float16x8_t coeff_vec = vdupq_n_f16(_norm_info.scale_coeff()); + const float16x8_t beta_vec_f16 = vdupq_n_f16(_norm_info.beta()); + const float16x8_t kappa_vec = vdupq_n_f16(_norm_info.kappa()); + + execute_window_loop( + window, + [&](const Coordinates &id) { + // Get range to normalize + const int current_row = do_2D_norm ? id[dim_y] : 0; + const int current_slice = id[dim]; + const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0; + const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0; + const int first_slice = std::max(current_slice - radius, min_left); + const int last_slice = std::min(current_slice + radius, max_right); + + // Accumulate 2D In-Map values + float16x8_t accu = vdupq_n_f16(0.f); + for (int j = first_row; j <= last_row; j++) + { + // Compute row displacement + const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y]; + const uint8_t *const input_squared_ptr = + input_squared.ptr() + row - (current_slice * input_squared_stride); + for (int i = first_slice; i <= last_slice; ++i) + { + accu = vaddq_f16(accu, vld1q_f16(reinterpret_cast<const float16_t *>( + input_squared_ptr + i * input_squared_stride))); + } + } + + const float16x8_t norm_f16 = + vpowq_f16(vaddq_f16(kappa_vec, vmulq_f16(coeff_vec, accu)), beta_vec_f16); + const float16x8_t normalized_pixel = vmulq_f16( + vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr())), vinvq_f16(norm_f16)); + vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), normalized_pixel); + }, + input, input_squared, output); + } +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + else + { + ARM_COMPUTE_ERROR("Not supported"); + } +} + +Status NENormalizationLayerExKernel::validate(const ITensorInfo *input, + const ITensorInfo *input_squared, + const ITensorInfo *output, + const NormalizationLayerInfo norm_info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, input_squared, output, norm_info)); + 
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), + input_squared->clone().get(), + output->clone().get(), norm_info) + .first); + + return Status{}; +} + +void NENormalizationLayerExKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + // Run function + (this->*_func)(window); +} diff --git a/libs/ARMComputeEx/src/core/UtilsEx.cpp b/libs/ARMComputeEx/src/core/UtilsEx.cpp new file mode 100644 index 000000000..b63093bbb --- /dev/null +++ b/libs/ARMComputeEx/src/core/UtilsEx.cpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/UtilsEx.h" + +#include <cstdint> +#include <fstream> +#include <map> +#include <string> + +using namespace arm_compute; + +const std::string & +arm_compute::string_from_activation_func_ex(ActivationLayerInfoEx::ActivationFunction act) +{ + static std::map<ActivationLayerInfoEx::ActivationFunction, const std::string> act_map = { + {ActivationLayerInfoEx::ActivationFunction::RSQRT, "RSQRT"}, + }; + + return act_map[act]; +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLActivationLayerEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLActivationLayerEx.cpp new file mode 100644 index 000000000..1e52fc429 --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLActivationLayerEx.cpp @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLActivationLayerEx.h" + +#include "arm_compute/core/CL/kernels/CLActivationLayerExKernel.h" + +using namespace arm_compute; + +void CLActivationLayerEx::configure(ICLTensor *input, ICLTensor *output, + ActivationLayerInfoEx act_info) +{ + auto k = arm_compute::support::cpp14::make_unique<CLActivationLayerExKernel>(); + k->configure(input, output, act_info); + _kernel = std::move(k); +} + +Status CLActivationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, + const ActivationLayerInfoEx &act_info) +{ + return CLActivationLayerExKernel::validate(input, output, act_info); +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLArgMinMax.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLArgMinMax.cpp new file mode 100644 index 000000000..dff743e89 --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLArgMinMax.cpp @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLArgMinMax.h" + +#include "arm_compute/core/CL/kernels/CLArgMinMaxKernel.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +namespace arm_compute +{ + +CLArgMinMax::CLArgMinMax() + : _input(nullptr), _output(nullptr), _argminmax_axis(), _interm_tensors(), _argminmax_kernels(), + _num_of_kernels() +{ +} + +void CLArgMinMax::configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis, + ArgOperation op) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), axis, op)); + _input = input; + _output = output; + _argminmax_axis = axis; + _arg_op = op; + // NOTE The argminmax_axis must have no duplication. + _num_of_kernels = axis.size(); + const size_t num_of_interm_tensors = _num_of_kernels - 1; + + _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors); + _argminmax_kernels = + arm_compute::support::cpp14::make_unique<CLArgMinMaxKernel[]>(_num_of_kernels); + + TensorShape shape{input->info()->tensor_shape()}; + for (size_t i = 0; i < num_of_interm_tensors; i++) + { + shape.set(_argminmax_axis[i], 1); + _interm_tensors[i].allocator()->init( + TensorInfo(shape, input->info()->num_channels(), input->info()->data_type())); + _interm_tensors[i].allocator()->allocate(); + } + + // Set a vector that is ordered ICLTensors sequentially. 
+ std::vector<ICLTensor *> tensors; + tensors.emplace_back(input); + for (size_t i = 0; i < num_of_interm_tensors; i++) + { + tensors.emplace_back(_interm_tensors.get() + i); + } + tensors.emplace_back(output); + + // Apply ArgMinMax on all kernels + for (size_t i = 0; i < _num_of_kernels; i++) + { + _argminmax_kernels[i].configure(tensors[i], tensors[i + 1], _argminmax_axis[i], op); + } +} + +Status CLArgMinMax::validate(const ITensorInfo *input, const std::vector<uint32_t> &argminmax_axis, + const ITensorInfo *output, ArgOperation op) +{ + const size_t num_of_kernels = argminmax_axis.size(); + const size_t num_of_interm_tensors = num_of_kernels - 1; + + // Create temporary tensor infos + auto interm_tensors = + arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors); + + // Create intermediate tensor info + TensorShape shape{input->tensor_shape()}; + + for (size_t i = 0; i < num_of_interm_tensors; i++) + { + shape.set(argminmax_axis[i], 1); + interm_tensors[i].set_data_type(input->data_type()); + interm_tensors[i].set_tensor_shape(shape); + interm_tensors[i].set_num_channels(input->num_channels()); + } + + // Set a vector that is ordered ITensorInfo sequentially. 
+ std::vector<const ITensorInfo *> tensors; + tensors.emplace_back(input); + for (size_t i = 0; i < num_of_interm_tensors; i++) + { + tensors.emplace_back(interm_tensors.get() + i); + } + tensors.emplace_back(output); + + // Validate argminmax only on all kernels + for (size_t i = 0; i < num_of_kernels; i++) + { + ARM_COMPUTE_RETURN_ON_ERROR( + CLArgMinMaxKernel::validate(tensors[i], tensors[i + 1], argminmax_axis[i], op)); + } + + return Status{}; +} + +void CLArgMinMax::run() +{ + for (size_t i = 0; i < _num_of_kernels; ++i) + { + CLScheduler::get().enqueue(_argminmax_kernels[i]); + } +} + +} // namespace arm_compute diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLArithmeticSubtractionEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLArithmeticSubtractionEx.cpp new file mode 100644 index 000000000..3f403c80a --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLArithmeticSubtractionEx.cpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLArithmeticSubtractionEx.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionExKernel.h" + +using namespace arm_compute; + +void CLArithmeticSubtractionEx::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, + ConvertPolicy policy) +{ + auto k = arm_compute::support::cpp14::make_unique<CLArithmeticSubtractionExKernel>(); + k->configure(input1, input2, output, policy); + _kernel = std::move(k); + + if (output->info()->dimension(0) > 1) + { + ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2; + + if (broadcasted_info->info()->dimension(0) == 1) + { + _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + } + } +} + +Status CLArithmeticSubtractionEx::validate(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output, ConvertPolicy policy) +{ + return CLArithmeticSubtractionExKernel::validate(input1, input2, output, policy); +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLBatchToSpaceND.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLBatchToSpaceND.cpp new file mode 100644 index 000000000..26e3798cc --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLBatchToSpaceND.cpp @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLBatchToSpaceND.h" + +#include "arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h" + +using namespace arm_compute; + +void CLBatchToSpaceND::configure(ICLTensor *input, ICLTensor *output, const int32_t *block_size) +{ + auto k = arm_compute::support::cpp14::make_unique<CLBatchToSpaceNDKernel>(); + k->configure(input, output, block_size); + _kernel = std::move(k); +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp new file mode 100644 index 000000000..7c5fe5eda --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h" + +#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, + BinaryLogicalOperation op) +{ + auto k = arm_compute::support::cpp14::make_unique<CLBinaryLogicalOpKernel>(); + k->configure(input1, input2, output, op); + _kernel = std::move(k); + + if (output->info()->dimension(0) > 1) + { + ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2; + if (broadcasted_info->info()->dimension(0) == 1) + { + _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + } + } +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp index e1059ab53..8e106737c 100644 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp @@ -17,7 +17,6 @@ #include "arm_compute/runtime/CL/functions/CLCast.h" #include "arm_compute/core/CL/kernels/CLCastKernel.h" -#include "support/ToolchainSupport.h" using namespace arm_compute; diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLComparisonOp.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLComparisonOp.cpp new file mode 100644 index 000000000..f6a745a25 --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLComparisonOp.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLComparisonOp.h" + +#include "arm_compute/core/CL/kernels/CLComparisonOpKernel.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +void CLComparisonOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, + const ComparisonOperation &op) +{ + auto k = arm_compute::support::cpp14::make_unique<CLComparisonOpKernel>(); + k->configure(input1, input2, output, op); + _kernel = std::move(k); + + if (output->info()->dimension(0) > 1) + { + ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2; + + if (broadcasted_info->info()->dimension(0) == 1) + { + _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + } + } +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp new file mode 100644 index 000000000..c2e4ca9ff --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLDepthToSpace.h" + +#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h" + +using namespace arm_compute; + +void CLDepthToSpace::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size) +{ + auto k = arm_compute::support::cpp14::make_unique<CLDepthToSpaceKernel>(); + k->configure(input, output, block_size); + _kernel = std::move(k); +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp new file mode 100644 index 000000000..2781784ca --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLEmbeddingLookup.h" + +#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h" + +using namespace arm_compute; + +void CLEmbeddingLookup::configure(const ICLTensor *input, ICLTensor *output, + const ICLTensor *lookups) +{ + auto k = arm_compute::support::cpp14::make_unique<CLEmbeddingLookupKernel>(); + k->configure(input, output, lookups); + _kernel = std::move(k); +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLExp.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLExp.cpp new file mode 100644 index 000000000..411fa8700 --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLExp.cpp @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLExp.h" + +#include "arm_compute/core/CL/kernels/CLExpKernel.h" + +using namespace arm_compute; + +void CLExp::configure(const ICLTensor *input, ICLTensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<CLExpKernel>(); + k->configure(input, output); + _kernel = std::move(k); +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp index 5552cbc6f..fb056fe45 100644 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp @@ -16,11 +16,7 @@ */ #include "arm_compute/runtime/CL/functions/CLGather.h" -#include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/kernels/CLGatherKernel.h" -#include "support/ToolchainSupport.h" - -#include <utility> using namespace arm_compute; diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp new file mode 100644 index 000000000..7180e9356 --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLHashtableLookup.h" + +#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h" + +using namespace arm_compute; + +void CLHashtableLookup::configure(const ICLTensor *lookups, const ICLTensor *keys, + const ICLTensor *input, ICLTensor *output, ICLTensor *hits) +{ + auto k = arm_compute::support::cpp14::make_unique<CLHashtableLookupKernel>(); + k->configure(lookups, keys, input, output, hits); + _kernel = std::move(k); +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp new file mode 100644 index 000000000..be35ea732 --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLNeg.h" + +#include "arm_compute/core/CL/kernels/CLNegKernel.h" + +using namespace arm_compute; + +void CLNeg::configure(ICLTensor *input, ICLTensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<CLNegKernel>(); + k->configure(input, output); + _kernel = std::move(k); +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLNormalizationLayerEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLNormalizationLayerEx.cpp new file mode 100644 index 000000000..276c4557a --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLNormalizationLayerEx.cpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLNormalizationLayerEx.h" + +#include "arm_compute/runtime/CL/CLScheduler.h" + +using namespace arm_compute; + +CLNormalizationLayerEx::CLNormalizationLayerEx() : _norm_kernel(), _border_handler() {} + +void CLNormalizationLayerEx::configure(ICLTensor *input, ICLTensor *output, + const NormalizationLayerInfo &norm_info) +{ + ARM_COMPUTE_ERROR_ON(input == nullptr); + + // Configure normalization kernel + _norm_kernel.configure(input, output, norm_info); + + // Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel + _border_handler.configure(input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0)); +} + +Status CLNormalizationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, + const NormalizationLayerInfo &norm_info) +{ + return CLNormalizationLayerExKernel::validate(input, output, norm_info); +} + +void CLNormalizationLayerEx::run() +{ + // Run border handler + CLScheduler::get().enqueue(_border_handler, false); + + // Run normalization kernel + CLScheduler::get().enqueue(_norm_kernel); +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp new file mode 100644 index 000000000..38adedd10 --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLPReLU.h" + +#include "arm_compute/core/CL/kernels/CLPReLUKernel.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +void CLPReLU::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<CLPReLUKernel>(); + k->configure(input, alpha, output); + _kernel = std::move(k); + + if (output->info()->dimension(0) > 1) + { + ICLTensor *broadcasted_info = (input->info()->dimension(0) == 1) ? input : alpha; + + if (broadcasted_info->info()->dimension(0) == 1) + { + _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + } + } +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp new file mode 100644 index 000000000..5265b6c34 --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp @@ -0,0 +1,28 @@ +/* +* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +* Copyright (c) 2016-2018 ARM Limited. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ +#include "arm_compute/runtime/CL/functions/CLPadLayerEx.h" + +#include "arm_compute/core/CL/kernels/CLPadLayerKernel.h" + +using namespace arm_compute; + +void CLPadLayerEx::configure(ICLTensor *input, ICLTensor *output, ICLTensor *pad_size) +{ + auto k = arm_compute::support::cpp14::make_unique<CLPadLayerKernel>(); + k->configure(input, output, pad_size); + _kernel = std::move(k); +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLPermuteEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLPermuteEx.cpp new file mode 100644 index 000000000..fb363270d --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLPermuteEx.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLPermuteEx.h" + +#include "arm_compute/core/CL/kernels/CLPermuteExKernel.h" + +using namespace arm_compute; + +void CLPermuteEx::configure(const ICLTensor *input, ICLTensor *output, + const PermutationVector &perm) +{ + auto k = arm_compute::support::cpp14::make_unique<CLPermuteExKernel>(); + k->configure(input, output, perm); + _kernel = std::move(k); +} + +Status CLPermuteEx::validate(const ITensorInfo *input, const ITensorInfo *output, + const PermutationVector &perm) +{ + ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteExKernel::validate(input, output, perm)); + return Status{}; +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp index e1add5e90..dc0baa8dd 100644 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp @@ -18,9 +18,6 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h" -#include "support/ToolchainSupport.h" - -#include <utility> using namespace arm_compute; diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp deleted file mode 100644 index 3382058db..000000000 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/runtime/CL/functions/CLReduceMax.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "support/ToolchainSupport.h" -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/kernels/CLReduceMaxKernel.h" - -#include <vector> -#include <algorithm> - -#include <utility> - -#define REDUCE_MAX_RUN_ON_CPU 1 - -namespace arm_compute -{ - -CLReduceMax::CLReduceMax() : _axis(0), _input(nullptr), _output(nullptr), _kernel(nullptr) {} - -void CLReduceMax::configure(ICLTensor *input, int axis, ICLTensor *output) -{ - _axis = axis; - - _input = input; - _output = output; - - auto k = arm_compute::support::cpp14::make_unique<CLReduceMaxKernel>(); - k->configure(input, axis, output); - _kernel = std::move(k); - - // We can handle for simple case only - // Output rank: 1 - // Axis: one axis value, restrict to 1 - ARM_COMPUTE_ERROR_ON(input->info()->tensor_shape().num_dimensions() != 2); - ARM_COMPUTE_ERROR_ON(output->info()->tensor_shape().num_dimensions() != 1); - ARM_COMPUTE_ERROR_ON(axis != 1); -} - -Status CLReduceMax::validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output) -{ - return CLReduceMaxKernel::validate(input, axis, output); -} - -void CLReduceMax::run() -{ -#if REDUCE_MAX_RUN_ON_CPU - run_on_cpu(); - - arm_compute::CLScheduler::get().sync(); -#else - arm_compute::CLScheduler::get().enqueue(*_kernel); -#endif -} - -void CLReduceMax::run_on_cpu() -{ - cl::CommandQueue q = CLScheduler::get().queue(); - - _input->map(q); - _output->map(q); - - // Compute by CPU for simple case - // 
Input rank: 2 - // Output rank: 1 - // Axis: one axis value, restrict to 1 - - float *input_data = (float *)_input->buffer(); - float *output_data = (float *)_output->buffer(); - - std::vector<float> container_max; - int cols = _input->info()->tensor_shape()[0]; - int rows = _input->info()->tensor_shape()[1]; - container_max.resize(rows); - - // Initialize as 1st element in row - float *input_pointer = input_data; - for (int i = 0; i < rows; i++) - { - container_max[i] = *input_pointer; - input_pointer += cols; - } - - // Update max value in row - for (int i = 0; i < rows; i++) - { - float max_in_row = container_max[i]; - for (int j = 1; j < cols; j++) - { - if (max_in_row < input_data[i * cols + j]) - { - max_in_row = input_data[i * cols + j]; - } - } - container_max[i] = max_in_row; - } - - for (int i = 0; i < rows; i++) - { - output_data[i] = container_max[i]; - } - - _input->unmap(q); - _output->unmap(q); -} -} // namespace arm_compute diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp new file mode 100644 index 000000000..2b8d82706 --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLReduceOperation.h" + +#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +using namespace arm_compute; + +CLReduceOperation::CLReduceOperation() + : _input(nullptr), _output(nullptr), _axis(), _interm_tensors(), _reduce_kernels() +{ +} + +Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo *output, + const std::set<uint32_t> &axis, const ReduceOperation &op) +{ + const size_t num_of_kernels = axis.size(); + const size_t num_of_interm_tensors = num_of_kernels - 1; + + // Create temporary tensor infos + auto interm_tensors = + arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors); + + // Create intermediate tensor info + TensorShape shape{input->tensor_shape()}; + + auto it = axis.begin(); + for (size_t i = 0; i < num_of_interm_tensors; ++i, ++it) + { + shape.set(*it, 1); + interm_tensors[i].set_data_type(input->data_type()); + interm_tensors[i].set_tensor_shape(shape); + interm_tensors[i].set_num_channels(input->num_channels()); + } + + // Set a vector that is ordered ITensorInfo sequentially. + std::vector<const ITensorInfo *> tensors; + tensors.emplace_back(input); + for (size_t i = 0; i < num_of_interm_tensors; ++i) + { + tensors.emplace_back(interm_tensors.get() + i); + } + tensors.emplace_back(output); + + // Validate ReduceOperation only on all kernels + it = axis.begin(); + for (size_t i = 0; i < num_of_kernels; ++i, ++it) + { + ARM_COMPUTE_RETURN_ON_ERROR( + CLReduceOperationKernel::validate(tensors[i], tensors[i + 1], *it, op)); + } + + return Status{}; +} + +void CLReduceOperation::configure(ICLTensor *input, ICLTensor *output, + const std::set<uint32_t> &axis, ReduceOperation op) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), axis, op)); + + _axis = axis; + + _input = input; + _output = output; + + // NOTE The axis must have no duplication. 
+ const size_t num_of_kernels = axis.size(); + const size_t num_of_interm_tensors = num_of_kernels - 1; + + _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors); + _reduce_kernels = + arm_compute::support::cpp14::make_unique<CLReduceOperationKernel[]>(num_of_kernels); + + TensorShape shape{input->info()->tensor_shape()}; + auto it = axis.begin(); + for (size_t i = 0; i < num_of_interm_tensors; ++i, ++it) + { + shape.set(*it, 1); + _interm_tensors[i].allocator()->init( + TensorInfo(shape, input->info()->num_channels(), input->info()->data_type())); + _interm_tensors[i].allocator()->allocate(); + } + + // Set a vector that is ordered ICLTensors sequentially. + std::vector<ICLTensor *> tensors; + tensors.emplace_back(input); + for (size_t i = 0; i < num_of_interm_tensors; ++i) + { + tensors.emplace_back(_interm_tensors.get() + i); + } + tensors.emplace_back(output); + + // Apply ReduceOperation on all kernels + it = axis.begin(); + for (size_t i = 0; i < num_of_kernels; ++i, ++it) + { + _reduce_kernels[i].configure(tensors[i], tensors[i + 1], *it, op); + } +} + +void CLReduceOperation::run() +{ + const size_t num_of_kernels = _axis.size(); + for (size_t i = 0; i < num_of_kernels; ++i) + { + CLScheduler::get().enqueue(_reduce_kernels[i]); + } +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLReductionMean.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLReductionMean.cpp deleted file mode 100644 index ab724e752..000000000 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLReductionMean.cpp +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/runtime/CL/functions/CLReductionMean.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLReductionMeanKernel.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/Tensor.h" -#include "support/ToolchainSupport.h" - -using namespace arm_compute; - -CLReductionMean::CLReductionMean() : _reduction_mean_kernel(), _fill_border_kernel() {} - -Status CLReductionMean::validate(const ITensorInfo *input, const ITensorInfo *output, - std::vector<uint32_t> axis) -{ - ARM_COMPUTE_RETURN_ON_ERROR(CLReductionMeanKernel::validate(input, output, axis)); - return Status{}; -} - -void CLReductionMean::configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis) -{ - _reduction_mean_kernel.configure(input, output, axis); - _fill_border_kernel.configure(input, _reduction_mean_kernel.border_size(), BorderMode::CONSTANT, - PixelValue(0)); -} - -void CLReductionMean::run() -{ - CLScheduler::get().enqueue(_fill_border_kernel); - CLScheduler::get().enqueue(_reduction_mean_kernel); -} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp new file mode 100644 index 000000000..c03826891 --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2018 Samsung 
Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLSpaceToBatchND.h" + +#include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h" + +using namespace arm_compute; + +void CLSpaceToBatchND::configure(const ICLTensor *input, const ICLTensor *block_size, + const ICLTensor *padding_size, ICLTensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<CLSpaceToBatchNDKernel>(); + k->configure(input, block_size, padding_size, output); + _kernel = std::move(k); +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp new file mode 100644 index 000000000..0f455f96f --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLSpaceToDepth.h" + +#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h" + +using namespace arm_compute; + +void CLSpaceToDepth::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size) +{ + auto k = arm_compute::support::cpp14::make_unique<CLSpaceToDepthKernel>(); + k->configure(input, output, block_size); + _kernel = std::move(k); +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLSquaredDifference.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLSquaredDifference.cpp new file mode 100644 index 000000000..dc6e4af44 --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLSquaredDifference.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLSquaredDifference.h" + +#include "arm_compute/core/CL/kernels/CLSquaredDifferenceKernel.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +void CLSquaredDifference::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<CLSquaredDifferenceKernel>(); + k->configure(input1, input2, output); + _kernel = std::move(k); + + if (output->info()->dimension(0) > 1) + { + ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2; + + if (broadcasted_info->info()->dimension(0) == 1) + { + _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + } + } +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSlice.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSlice.cpp deleted file mode 100644 index cd576cec1..000000000 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSlice.cpp +++ /dev/null @@ -1,307 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * Copyright 2018 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/runtime/CL/functions/CLStridedSlice.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h" -#include "arm_compute/core/utils/misc/Utility.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "support/ToolchainSupport.h" -#include <vector> - -using namespace arm_compute; - -static const int32_t maxDims = 4; - -// Return the index for the first element along that axis. This index will be a -// positive integer between [0, axisSize - 1] that can be used to index -// directly into the data. -inline int32_t StartForAxis(int32_t beginMask, std::vector<int32_t> const &startIndices, - std::vector<int32_t> const &strides, const TensorShape &inputShape, - int32_t axis) -{ - // Begin with the specified index - int32_t start = startIndices[axis]; - - // beginMask override - if (beginMask & 1 << axis) - { - if (strides[axis] > 0) - { - // Forward iteration - use the first element. These values will get - // clamped below (Note: We could have set them to 0 and axisSize-1, but - // use lowest() and max() to maintain symmetry with StopForAxis()) - start = std::numeric_limits<int32_t>::lowest(); - } - else - { - // Backward iteration - use the last element. - start = std::numeric_limits<int32_t>::max(); - } - } - - // Handle negative indices - int32_t axisSize = inputShape[axis]; - if (start < 0) - { - start += axisSize; - } - - // Clamping - start = arm_compute::utility::clamp(start, 0, axisSize - 1); - - return start; -} - -// Return the "real" index for the end of iteration along that axis. This is an -// "end" in the traditional C sense, in that it points to one past the last -// element. ie. So if you were iterating through all elements of a 1D array of -// size 4, this function would return 4 as the stop, because it is one past the -// "real" indices of 0, 1, 2 & 3. 
-inline int32_t StopForAxis(int32_t endMask, std::vector<int32_t> const &stopIndices, - std::vector<int32_t> const &strides, const TensorShape &inputShape, - int32_t axis) -{ - // Begin with the specified index - int32_t stop = stopIndices[axis]; - - // endMask override - if (endMask & (1 << axis)) - { - if (strides[axis] > 0) - { - // Forward iteration - use the last element. These values will get - // clamped below - stop = std::numeric_limits<int32_t>::max(); - } - else - { - // Backward iteration - use the first element. - stop = std::numeric_limits<int32_t>::lowest(); - } - } - - // Handle negative indices - int32_t axisSize = inputShape[axis]; - if (stop < 0) - { - stop += axisSize; - } - - // Clamping - // Because the end index points one past the last element, we need slightly - // different clamping ranges depending on the direction. - if (strides[axis] > 0) - { - // Forward iteration - stop = arm_compute::utility::clamp(stop, 0, axisSize); - } - else - { - // Backward iteration - stop = arm_compute::utility::clamp(stop, -1, axisSize - 1); - } - - return stop; -} - -inline int32_t offset4D(const TensorShape &shape, int32_t b, int32_t d, int32_t h, int32_t w) -{ - int32_t offset = b * shape[2] * shape[1] * shape[0]; - offset += d * shape[1] * shape[0]; - offset += h * shape[0]; - offset += w; - return offset; -} - -void CLStridedSlice::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData, - ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, - int32_t endMask, int32_t shrinkAxisMask) -{ - auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceKernel>(); - k->configure(input, output, beginData, endData, stridesData, beginMask, endMask, shrinkAxisMask); - _kernel = std::move(k); -} - -void CLStridedSliceCPU::configure(ICLTensor *input, ICLTensor *output, ICLTensor *beginData, - ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, - int32_t endMask, int32_t shrinkAxisMask) -{ - 
ARM_COMPUTE_ERROR_THROW_ON(CLStridedSliceKernel::validate( - input->info(), output->info(), beginData->info(), endData->info(), stridesData->info(), - beginMask, endMask, shrinkAxisMask)); - - _input = input; - _output = output; - _beginData = beginData; - _endData = endData; - _stridesData = stridesData; - _beginMask = beginMask; - _endMask = endMask; - _shrinkAxisMask = shrinkAxisMask; -} - -void CLStridedSliceCPU::run() -{ - run_on_cpu(); - - arm_compute::CLScheduler::get().sync(); -} - -inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride) -{ - if (stride > 0) - { - return ((stop - start - 1) / stride) + 1; - } - else - { - return ((stop - start + 1) / stride) + 1; - } -} - -template <typename T> -inline void StridedSlice(const T *inputData, const TensorShape &inputShape, int32_t beginMask, - int32_t endMask, const std::vector<int32_t> &startIndices, - const std::vector<int32_t> &stopIndices, - const std::vector<int32_t> &strides, T *outputData) -{ - ARM_COMPUTE_ERROR_ON(startIndices.size() != maxDims); - ARM_COMPUTE_ERROR_ON(stopIndices.size() != maxDims); - ARM_COMPUTE_ERROR_ON(strides.size() != maxDims); - - const int32_t start_b = StartForAxis(beginMask, startIndices, strides, inputShape, 3); - const int32_t stop_b = StopForAxis(endMask, stopIndices, strides, inputShape, 3); - const int32_t start_d = StartForAxis(beginMask, startIndices, strides, inputShape, 2); - const int32_t stop_d = StopForAxis(endMask, stopIndices, strides, inputShape, 2); - const int32_t start_h = StartForAxis(beginMask, startIndices, strides, inputShape, 1); - const int32_t stop_h = StopForAxis(endMask, stopIndices, strides, inputShape, 1); - const int32_t start_w = StartForAxis(beginMask, startIndices, strides, inputShape, 0); - const int32_t stop_w = StopForAxis(endMask, stopIndices, strides, inputShape, 0); - - // The shape of outputData may collapse in one-dimension. - // Therefore, it is necessary to create a shape that matches the result of the outputData. 
- TensorShape outputShape( - getOutDim(start_w, stop_w, strides[0]), getOutDim(start_h, stop_h, strides[1]), - getOutDim(start_d, stop_d, strides[2]), getOutDim(start_b, stop_b, strides[3])); - for (int32_t in_b = start_b, b = 0; strides[3] > 0 ? in_b < stop_b : in_b > stop_b; - in_b += strides[3], b++) - { - for (int32_t in_d = start_d, d = 0; strides[2] > 0 ? in_d < stop_d : in_d > stop_d; - in_d += strides[2], d++) - { - for (int32_t in_h = start_h, h = 0; strides[1] > 0 ? in_h < stop_h : in_h > stop_h; - in_h += strides[1], h++) - { - for (int32_t in_w = start_w, w = 0; strides[0] > 0 ? in_w < stop_w : in_w > stop_w; - in_w += strides[0], w++) - { - outputData[offset4D(outputShape, b, d, h, w)] = - inputData[offset4D(inputShape, in_b, in_d, in_h, in_w)]; - } - } - } - } -} - -void CLStridedSliceCPU::run_on_cpu() -{ - // TODO: Support shrinkAxisMask - cl::CommandQueue q = CLScheduler::get().queue(); - - _input->map(q); - _output->map(q); - _beginData->map(q); - _endData->map(q); - _stridesData->map(q); - - TensorShape inputShape = _input->info()->tensor_shape(); - TensorShape outputShape = _output->info()->tensor_shape(); - - std::vector<int32_t> starts; - std::vector<int32_t> stops; - std::vector<int32_t> strides; - - for (uint32_t idx = 0; idx <= _input->info()->num_dimensions() - 1; ++idx) - { - starts.emplace_back(reinterpret_cast<int32_t *>(_beginData->buffer())[idx]); - stops.emplace_back(reinterpret_cast<int32_t *>(_endData->buffer())[idx]); - strides.emplace_back(reinterpret_cast<int32_t *>(_stridesData->buffer())[idx]); - } - - for (uint32_t i = _input->info()->num_dimensions(); i < maxDims; i++) - { - starts.emplace_back(0); - stops.emplace_back(1); - strides.emplace_back(1); - } - - switch (_input->info()->data_type()) - { - case DataType::U8: - case DataType::QASYMM8: - StridedSlice(reinterpret_cast<const uint8_t *>(_input->buffer()), inputShape, _beginMask, - _endMask, starts, stops, strides, - reinterpret_cast<uint8_t *>(_output->buffer())); - 
break; - case DataType::S8: - case DataType::QS8: - StridedSlice(reinterpret_cast<const int8_t *>(_input->buffer()), inputShape, _beginMask, - _endMask, starts, stops, strides, reinterpret_cast<int8_t *>(_output->buffer())); - break; - case DataType::U16: - StridedSlice(reinterpret_cast<const uint16_t *>(_input->buffer()), inputShape, _beginMask, - _endMask, starts, stops, strides, - reinterpret_cast<uint16_t *>(_output->buffer())); - break; - case DataType::S16: - case DataType::QS16: - StridedSlice(reinterpret_cast<const int16_t *>(_input->buffer()), inputShape, _beginMask, - _endMask, starts, stops, strides, - reinterpret_cast<int16_t *>(_output->buffer())); - break; - case DataType::F16: - // Not sure this works. - StridedSlice(reinterpret_cast<const half *>(_input->buffer()), inputShape, _beginMask, - _endMask, starts, stops, strides, reinterpret_cast<half *>(_output->buffer())); - break; - case DataType::U32: - StridedSlice(reinterpret_cast<const uint32_t *>(_input->buffer()), inputShape, _beginMask, - _endMask, starts, stops, strides, - reinterpret_cast<uint32_t *>(_output->buffer())); - break; - case DataType::S32: - StridedSlice(reinterpret_cast<const int32_t *>(_input->buffer()), inputShape, _beginMask, - _endMask, starts, stops, strides, - reinterpret_cast<int32_t *>(_output->buffer())); - break; - case DataType::F32: - StridedSlice(reinterpret_cast<const float *>(_input->buffer()), inputShape, _beginMask, - _endMask, starts, stops, strides, reinterpret_cast<float *>(_output->buffer())); - break; - default: - ARM_COMPUTE_ERROR("DataType not supported"); - break; - } - - _input->unmap(q); - _output->unmap(q); - _beginData->unmap(q); - _endData->unmap(q); - _stridesData->unmap(q); -} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSliceEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSliceEx.cpp new file mode 100644 index 000000000..be7353493 --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSliceEx.cpp @@ 
-0,0 +1,30 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLStridedSliceEx.h" + +#include "arm_compute/core/CL/kernels/CLStridedSliceExKernel.h" + +using namespace arm_compute; + +void CLStridedSliceEx::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData, + ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, + int32_t endMask, int32_t shrinkAxisMask) +{ + auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceExKernel>(); + k->configure(input, output, beginData, endData, stridesData, beginMask, endMask, shrinkAxisMask); + _kernel = std::move(k); +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp index 6426364c9..19177497c 100644 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp @@ -15,12 +15,9 @@ * limitations under the License. 
*/ #include "arm_compute/runtime/CL/functions/CLTopKV2.h" +#include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/CLHelpers.h" - -#include <vector> -#include <algorithm> #include "../../topk_v2.h" diff --git a/libs/ARMComputeEx/src/runtime/NEON/functions/NENormalizationLayerEx.cpp b/libs/ARMComputeEx/src/runtime/NEON/functions/NENormalizationLayerEx.cpp new file mode 100644 index 000000000..988e92715 --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/NEON/functions/NENormalizationLayerEx.cpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/NEON/functions/NENormalizationLayerEx.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +using namespace arm_compute; + +NENormalizationLayerEx::NENormalizationLayerEx(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _norm_kernel(), _multiply_kernel(), + _border_handler(), _input_squared() +{ +} + +void NENormalizationLayerEx::configure(const ITensor *input, ITensor *output, + const NormalizationLayerInfo &norm_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + TensorInfo tensor_info(input->info()->tensor_shape(), 1, input->info()->data_type(), + input->info()->quantization_info()); + _input_squared.allocator()->init(tensor_info); + + // Manage intermediate buffers + _memory_group.manage(&_input_squared); + + // Configure kernels + _norm_kernel.configure(input, &_input_squared, output, norm_info); + _multiply_kernel.configure(input, input, &_input_squared, 1.0f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + _border_handler.configure(&_input_squared, _norm_kernel.border_size(), BorderMode::CONSTANT, + PixelValue(0.0f)); + + // Allocate the tensor once the configure methods have been called + _input_squared.allocator()->allocate(); +} + +Status NENormalizationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, + const NormalizationLayerInfo &norm_info) +{ + // Perform validation step + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + + ARM_COMPUTE_RETURN_ON_ERROR( + NENormalizationLayerExKernel::validate(input, input, output, norm_info)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate( + input, input, output, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + + return Status{}; +} + +void NENormalizationLayerEx::run() +{ + _memory_group.acquire(); + + NEScheduler::get().schedule(&_multiply_kernel, Window::DimY); + NEScheduler::get().schedule(&_border_handler, Window::DimY); + 
NEScheduler::get().schedule(&_norm_kernel, Window::DimY); + + _memory_group.release(); +} diff --git a/libs/ARMComputeEx/src/runtime/topk_v2.h b/libs/ARMComputeEx/src/runtime/topk_v2.h index a18ff0b0d..f94effea1 100644 --- a/libs/ARMComputeEx/src/runtime/topk_v2.h +++ b/libs/ARMComputeEx/src/runtime/topk_v2.h @@ -15,6 +15,12 @@ * limitations under the License. */ +/** + * @file topk_v2.h + * @brief This file contains TopK method and TopContainer class for TopK operation + * @ingroup COM_AI_RUNTIME + */ + #ifndef __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__ #define __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__ @@ -26,34 +32,62 @@ namespace rt { namespace optimized_ops { -// The follwing codes are impemented and modified while referring to TFLite topk_v2.cc file. -// TopK_v2 of NN Runtime supports TENSOR_FLOAT32, TENSOR_QUANT8_ASYMM, TENSOR_INT32 other than -// TFLite. -//(TFLite additionaly supports kTfLiteInt64.) - -// The class that collects top indexes of k values. Based on template -// tensorflow::gtl::TopN<> but, for optimization, -// it re-uses the same container. +/** + * @brief Class to define TopK operation + * @note The following codes are implemented and modified while referring to TFLite topk_v2.cc file. + * TopK_v2 of NN Runtime supports TENSOR_FLOAT32, TENSOR_QUANT8_ASYMM, TENSOR_INT32 other than + * TFLite. + * (TFLite additionally supports kTfLiteInt64.) + * + * The class that collects top indexes of k values. Based on template + * tensorflow::gtl::TopN<> but, for optimization, + * it re-uses the same container. 
 + */ template <typename T> class TopContainer { public: + /** + * @brief Prevent default constructor of this class + */ TopContainer() = delete; + /** + * @brief Constructor with params + * @param [in] k The top k predictions + * @param [in] row_size Size of row in data + */ TopContainer(int32 k, int32 row_size) : k_(k), container_(), values_(nullptr) { container_.reserve(std::min(k, row_size) + 1); } - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + * @param [in] topContainer To copy + */ TopContainer(const TopContainer &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + * @param [in] topContainer To copy + * @return Reference of TopContainer + */ TopContainer &operator=(const TopContainer &) = delete; + /** + * @brief Start collecting + * @param [in] values To set as values + * @return N/A + */ void start_collecting(const T *values) { values_ = values; container_.clear(); } + /** + * @brief Push a value to be compared for topk + * @param [in] a A value to compare + * @return N/A + */ void push(int32 a) { auto comparator = [this](int32 a, int32 b) { return compare_fun(a, b); }; @@ -74,6 +108,10 @@ public: } } + /** + * @brief Get sorted result from pushed values + * @return Reference of vector with sorted values + */ const std::vector<int32> &sorted_result() { auto comparator = [this](int32 a, int32 b) { return compare_fun(a, b); }; @@ -111,6 +149,16 @@ private: } }; +/** + * @brief Performs TopK operation with params + * @param [in] row_size Size of row in data + * @param [in] num_rows The number of rows in data + * @param [in] data To be operated in + * @param [in] k The top k predictions + * @param [out] output_indexes Indexes of targets in the top k predictions 
+ * @param [out] output_values Values of targets in the top k predictions + * @return N/A + */ template <typename T> void TopK(int32 row_size, int32 num_rows, const T *data, int32 k, int32 *output_indexes, T *output_values) |