Diffstat (limited to 'libs')
248 files changed, 17663 insertions, 4105 deletions
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h b/libs/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h index 026487077..e4e752ef9 100644 --- a/libs/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h +++ b/libs/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h @@ -14,6 +14,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + +/** + * @file CLKernelLibraryEx.h + * @ingroup COM_AI_RUNTIME + * @brief This file is a cloned version of CLKernelLibrary.h in ACL. This file defines + * an interface for CLKernelLibrary.cpp which adds more OpenCL kernels on top of ACL. + */ + #ifndef __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ #define __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ @@ -27,58 +35,76 @@ namespace arm_compute { -/** CLKernelLibrary class */ +/** + * @brief Class to build OpenCL kernels added from nnfw + * */ class CLKernelLibraryEx { using StringSet = std::set<std::string>; private: - /** Default Constructor. */ + /** + * @brief Construct a new CLKernelLibraryEx object + */ CLKernelLibraryEx(); public: - /** Prevent instances of this class from being copied */ + /** + * @brief Prevent instances of this class from being copied. + */ CLKernelLibraryEx(const CLKernelLibraryEx &) = delete; - /** Prevent instances of this class from being copied */ + + /** + * @brief Prevent instances of this class from being copied. + */ const CLKernelLibraryEx &operator=(const CLKernelLibraryEx &) = delete; - /** Access the KernelLibrary singleton. - * @return The KernelLibrary instance. + + /** + * @brief Get the KernelLibrary singleton. + * @return The KernelLibrary instance */ static CLKernelLibraryEx &get(); - /** Initialises the kernel library. - * - * @param[in] kernel_path (Optional) Path of the directory from which kernel sources are loaded. - * @param[in] context (Optional) CL context used to create programs. - * @param[in] device (Optional) CL device for which the programs are created. - */ - void init(std::string kernel_path = ".", cl::Context context = cl::Context::getDefault(), - cl::Device device = cl::Device::getDefault()) + + /** + * @brief Initialise the kernel library. + * @param[in] kernel_path Path of the directory from which kernel sources are loaded. + * @param[in] context CL context used to create programs. + * @param[in] device CL device for which the programs are created. + * @return N/A + */ + void init(std::string kernel_path, cl::Context context, cl::Device device) { _kernel_path = std::move(kernel_path); _context = std::move(context); _device = std::move(device); } - /** Sets the path that the kernels reside in. - * - * @param[in] kernel_path Path of the kernel. + + /** + * @brief Set the path that the kernels reside in. + * @param[in] kernel_path Path of the directory from which kernel sources are loaded. + * @return N/A */ void set_kernel_path(const std::string &kernel_path) { _kernel_path = kernel_path; }; - /** Gets the path that the kernels reside in. + + /** + * @brief Get the path that the kernels reside in. + * @return the path of kernel files */ std::string get_kernel_path() { return _kernel_path; }; - /** Gets the source of the selected program. - * + + /** + * @brief Get the source of the selected program. * @param[in] program_name Program name. - * * @return Source of the selected program. */ std::string get_program_source(const std::string &program_name); - /** Sets the CL context used to create programs. - * + + /** + * @brief Set the CL context used to create programs. 
* @note Setting the context also resets the device to the * first one available in the new context. - * * @param[in] context A CL context. + * @return N/A */ void set_context(cl::Context context) { @@ -102,42 +128,56 @@ public: } } - /** Accessor for the associated CL context. - * + /** + * @brief Return associated CL context. * @return A CL context. */ cl::Context &context() { return _context; } - /** Sets the CL device for which the programs are created. - * + /** + * @brief Set the CL device for which the programs are created. * @param[in] device A CL device. + * @return N/A */ void set_device(cl::Device device) { _device = std::move(device); } - /** Return the device version - * + /** + * @brief Gets the CL device for which the programs are created. + * @return A CL device. + */ + cl::Device &get_device() { return _device; } + + /** + * @brief Return the device version * @return The content of CL_DEVICE_VERSION */ std::string get_device_version(); - /** Creates a kernel from the kernel library. - * + + /** + * @brief Create a kernel from the kernel library. * @param[in] kernel_name Kernel name. * @param[in] build_options_set Kernel build options as a set. - * * @return The created kernel. */ Kernel create_kernel(const std::string &kernel_name, const StringSet &build_options_set = {}) const; - /** Find the maximum number of local work items in a workgroup can be supported for the kernel. - * + + /** + * @brief Find the maximum number of local work items in a workgroup can be supported for the + * kernel. + * @param[in] kernel kernel object */ + size_t max_local_workgroup_size(const cl::Kernel &kernel) const; - /** Return the default NDRange for the device. - * + /** + * @brief Return the default NDRange for the device. + * @return default NDRangeof the device */ cl::NDRange default_ndrange() const; - /** Clear the library's cache of binary programs + /** + * @brief Clear the library's cache of binary programs + * @return N/A */ void clear_programs_cache() { @@ -145,29 +185,45 @@ public: _built_programs_map.clear(); } - /** Access the cache of built OpenCL programs */ + /** + * @brief Access the cache of built OpenCL programs + * @return program map data structure of which key is name of kernel and value is + * kerel source name. (*.cl) + */ const std::map<std::string, cl::Program> &get_built_programs() const { return _built_programs_map; } - /** Add a new built program to the cache - * + /** + * @brief Add a new built program to the cache * @param[in] built_program_name Name of the program * @param[in] program Built program to add to the cache + * @return N/A */ void add_built_program(const std::string &built_program_name, cl::Program program); + /** + * @brief Returns true if FP16 is supported by the CL device + * @return true if the CL device supports FP16 + */ + bool fp16_supported() const; + + /** + * @brief Returns true if int64_base_atomics extension is supported by the CL device + * @return true if the CL device supports int64_base_atomics extension + */ + bool int64_base_atomics_supported() const; + private: - /** Load program and its dependencies. - * + /** + * @brief Load program and its dependencies. * @param[in] program_name Name of the program to load. */ const Program &load_program(const std::string &program_name) const; - /** Concatenates contents of a set into a single string. - * + /** + * @brief Concatenates contents of a set into a single string. * @param[in] s Input set to concatenate. - * * @return Concatenated string. 
*/ std::string stringify_set(const StringSet &s) const; diff --git a/libs/ARMComputeEx/arm_compute/core/CL/OpenCLEx.h b/libs/ARMComputeEx/arm_compute/core/CL/OpenCLEx.h new file mode 100644 index 000000000..dbda354d6 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/OpenCLEx.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_OPENCLEX_H__ +#define __ARM_COMPUTE_OPENCLEX_H__ + +#include <string> +#include <utility> + +/* Configure the Khronos C++ wrapper to target OpenCL 1.2: */ +#ifndef ARM_COMPUTE_NO_EXCEPTIONS +#define CL_HPP_ENABLE_EXCEPTIONS +#endif // ARM_COMPUTE_NO_EXCEPTIONS +#define CL_HPP_CL_1_2_DEFAULT_BUILD +#define CL_HPP_TARGET_OPENCL_VERSION 110 +#define CL_HPP_MINIMUM_OPENCL_VERSION 110 +#include <CL/cl2.hpp> + +namespace arm_compute +{ +/** Class for loading OpenCL symbols. */ +class CLSymbolsEx final +{ +private: + CLSymbolsEx() = default; + void load_symbols(void *handle); + +public: + /** Get the static instance of CLSymbols. + * + * @return The static instance of CLSymbols. + */ + static CLSymbolsEx &get(); + /** Load symbols from the given OpenCL library path. + * + * @param[in] library Path to the OpenCL library. + * + * @return True if loading the library is successful. + */ + bool load(const std::string &library); + /** Load symbols from any of the default OpenCL library names. + * + * @return True if loading any library is successful. + */ + bool load_default(); + +#define DECLARE_FUNCTION_PTR(func_name) std::function<decltype(func_name)> func_name##_ptr = nullptr + + DECLARE_FUNCTION_PTR(clGetEventInfo); + DECLARE_FUNCTION_PTR(clSetEventCallback); + +#undef DECLARE_FUNCTION_PTR + +private: + std::pair<bool, bool> _loaded{false, false}; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_OPENCLEX_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLActivationLayerExKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLActivationLayerExKernel.h new file mode 100644 index 000000000..080cc47ef --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLActivationLayerExKernel.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLACTIVATIONLAYEREXKERNEL_H__ +#define __ARM_COMPUTE_CLACTIVATIONLAYEREXKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the activation layer kernel. */ +class CLActivationLayerExKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLActivationLayerExKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLActivationLayerExKernel(const CLActivationLayerExKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLActivationLayerExKernel &operator=(const CLActivationLayerExKernel &) = delete; + /** Allow instances of this class to be moved */ + CLActivationLayerExKernel(CLActivationLayerExKernel &&) = default; + /** Allow instances of this class to be moved */ + CLActivationLayerExKernel &operator=(CLActivationLayerExKernel &&) = default; + /** Default destructor */ + ~CLActivationLayerExKernel() = default; + /** Set the input and output tensor. + * + * @note If the output tensor is a nullptr, the activation function will be performed in-place + * + * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will + * store the result + * of the activation function. Data types supported: + * QASYMM8/F16/F32. + * @param[out] output Destination tensor. Data type supported: same as @p input + * @param[in] act_info Activation layer information. + */ + void configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfoEx act_info); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLActivationLayerKernel + * + * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor + * will store the result + * of the activation function. Data types supported: QASYMM8/F16/F32. + * @param[in] output Destination tensor info. Data type supported: same as @p input + * @param[in] act_info Activation layer information. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ActivationLayerInfoEx &act_info); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + ICLTensor *_input; + ICLTensor *_output; + bool _run_in_place; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLACTIVATIONLAYEREXKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxKernel.h new file mode 100644 index 000000000..b91a26159 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxKernel.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CLArgMinMaxKernel.h + * @brief This file defines CLArgMinMaxKernel + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __ARM_COMPUTE_CLARG_MIN_MAX_KERNEL_H__ +#define __ARM_COMPUTE_CLARG_MIN_MAX_KERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to define interface for the argminmax max kernel. + */ +class CLArgMinMaxKernel : public ICLKernel +{ +public: + /** + * @brief Default constructor. + */ + CLArgMinMaxKernel(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLArgMinMaxKernel to be copied + */ + CLArgMinMaxKernel(const CLArgMinMaxKernel &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLArgMinMaxKernel to be copied + * @return Reference of this instance + */ + CLArgMinMaxKernel &operator=(const CLArgMinMaxKernel &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLArgMinMaxKernel to be moved + */ + CLArgMinMaxKernel(CLArgMinMaxKernel &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLArgMinMaxKernel to be moved + * @return Reference of this instance + */ + CLArgMinMaxKernel &operator=(CLArgMinMaxKernel &&) = default; + /** + * @brief Initialise the kernel's input, output and border mode. + * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32. + * @param[out] output The output tensor, Data types supported: same as @p input. + * @param[in] argminmax_axis Axis to argminmax + * return N/A + */ + void configure(const ICLTensor *input, ICLTensor *output, const uint32_t argminmax_axis, + ArgOperation op); + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref + * CLArgMinMaxKernel + * @param[in] input An input tensor info. Data types supported: U8/QASYMM8/S32/F32. + * @param[in] output The output tensor info, Data types supported: same as @p input1. 
+ * @param[in] argminmax_axis Axis to argminmax + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const uint32_t argminmax_axis, ArgOperation op); + + /* + * @brief Run CLArgMinMaxKernel op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + /* + * @brief Run CLArgMinMaxKernel op on CPU + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run_on_cpu(cl::CommandQueue &queue); + +private: + const ICLTensor *_input; + ICLTensor *_output; + uint32_t _argminmax_axis; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLargminmaxMAXKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLArithmeticSubtractionExKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLArithmeticSubtractionExKernel.h new file mode 100644 index 000000000..9a765f310 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLArithmeticSubtractionExKernel.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLARITHMETICSUBTRACTIONEXKERNEL_H__ +#define __ARM_COMPUTE_CLARITHMETICSUBTRACTIONEXKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the arithmetic subtraction kernel (support broadcasting) + * + * Arithmetic subtraction is computed by: + * @f[ output(x,y) = input1(x,y) - input2(x,y) @f] + */ +class CLArithmeticSubtractionExKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLArithmeticSubtractionExKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLArithmeticSubtractionExKernel(const CLArithmeticSubtractionExKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLArithmeticSubtractionExKernel &operator=(const CLArithmeticSubtractionExKernel &) = delete; + /** Allow instances of this class to be moved */ + CLArithmeticSubtractionExKernel(CLArithmeticSubtractionExKernel &&) = default; + /** Allow instances of this class to be moved */ + CLArithmeticSubtractionExKernel &operator=(CLArithmeticSubtractionExKernel &&) = default; + /** Default destructor */ + ~CLArithmeticSubtractionExKernel() = default; + + /** Initialise the kernel's inputs, output and convertion policy. + * + * @param[in] input1 First tensor input. Data types supported: U8/S16/F16/F32. + * @param[in] input2 Second tensor input. Data types supported: U8/S16/F16/F32. + * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), + * S16/F16/F32. + * @param[in] policy Policy to use to handle overflow. 
+ */ + void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, + ConvertPolicy policy); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLArithmeticSubtractionExKernel + * + * @param[in] input1 First tensor input info. Data types supported: U8/S16/F16/F32. + * @param[in] input2 Second tensor input info. Data types supported: U8/S16/F16/F32. + * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), + * S16/F16/F32. + * @param[in] policy Policy to use to handle overflow. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output, ConvertPolicy policy); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + BorderSize border_size() const override; + +private: + const ICLTensor *_input1; /**< Source tensor 1 */ + const ICLTensor *_input2; /**< Source tensor 2 */ + ICLTensor *_output; /**< Destination tensor */ +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLARITHMETICSUBTRACTIONEXKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h new file mode 100644 index 000000000..1387897c9 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_KERNEL_H__ +#define __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_KERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to perform BATCH_TO_SPACE_ND operation */ +class CLBatchToSpaceNDKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLBatchToSpaceNDKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLBatchToSpaceNDKernel(const CLBatchToSpaceNDKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLBatchToSpaceNDKernel &operator=(const CLBatchToSpaceNDKernel &) = delete; + /** Allow instances of this class to be moved */ + CLBatchToSpaceNDKernel(CLBatchToSpaceNDKernel &&) = default; + /** Allow instances of this class to be moved */ + CLBatchToSpaceNDKernel &operator=(CLBatchToSpaceNDKernel &&) = default; + /** Default destructor */ + ~CLBatchToSpaceNDKernel() = default; + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. 
+ */ + void configure(const ICLTensor *input, ICLTensor *output, const int32_t *block_size); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; /**< Source tensor */ + ICLTensor *_output; /**< Destination tensor */ +}; + +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_KERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h new file mode 100644 index 000000000..ab33d9d3a --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__ +#define __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to return truth values of two input tensors for Binary Logical Op*/ +class CLBinaryLogicalOpKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLBinaryLogicalOpKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLBinaryLogicalOpKernel(const CLBinaryLogicalOpKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLBinaryLogicalOpKernel &operator=(const CLBinaryLogicalOpKernel &) = delete; + /** Allow instances of this class to be moved */ + CLBinaryLogicalOpKernel(CLBinaryLogicalOpKernel &&) = default; + /** Allow instances of this class to be moved */ + CLBinaryLogicalOpKernel &operator=(CLBinaryLogicalOpKernel &&) = default; + /** Initialize the kernel's input, output. + * + * @param[in] input1 Source tensor1. + * @param[in] input2 Source tensor2. + * @param[out] output Output tensor. + */ + void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, + BinaryLogicalOperation op); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + + BorderSize border_size() const override; + +private: + const ICLTensor *_input1; + const ICLTensor *_input2; + ICLTensor *_output; +}; + +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h index 6bd33bf8f..4c2feb903 100644 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h @@ -14,6 +14,13 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + +/** + * @file CLCastKernel.h + * @ingroup COM_AI_RUNTIME + * @brief This file defines CLCastKernel class + */ + #ifndef __ARM_COMPUTE_CLCASTKERNEL_H__ #define __ARM_COMPUTE_CLCASTKERNEL_H__ @@ -23,30 +30,62 @@ namespace arm_compute { class ICLTensor; -/** OpenCL kernel to perform a cast operation */ +/** + * @brief Class to define OpenCL kernel for cast operation + */ class CLCastKernel : public ICLKernel { public: - /** Default constructor */ + /** + * @brief Construct CLCastKernel object + */ CLCastKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + */ CLCastKernel(const CLCastKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + */ CLCastKernel &operator=(const CLCastKernel &) = delete; - /** Allow instances of this class to be moved */ + + /** + * @brief Construct CLCastKernel object using default move constructor + * @param[in] CLCastKernel object to move + */ CLCastKernel(CLCastKernel &&) = default; - /** Allow instances of this class to be moved */ + + /** + * @brief Allow instances of this class to be moved + * @param[in] CLCastKernel object to move + */ CLCastKernel &operator=(CLCastKernel &&) = default; - /** Default destructor */ + + /** + * @brief Destruct this CLCastKernel object + */ ~CLCastKernel() = default; - /** Initialise the kernel's input and output. - * + + /** + * @brief Initialise the kernel's input and output. * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @return N/A */ void configure(const ICLTensor *input, ICLTensor *output); - // Inherited methods overridden: + /** + * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command + * queue. + * @note The queue is *not* flushed by this method, and therefore the kernel will not have + * been executed by the time this method returns. + * @param[in] window Region on which to execute the kernel. (Must be a valid region of + * the window returned by window()). + * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A + * @return N/A + */ void run(const Window &window, cl::CommandQueue &queue) override; private: diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLComparisonOpKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLComparisonOpKernel.h new file mode 100644 index 000000000..f5f455993 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLComparisonOpKernel.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __ARM_COMPUTE_CLCOMPARISON_OP_KERNEL_H__ +#define __ARM_COMPUTE_CLCOMPARISON_OP_KERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to check if values in both tensors are equal*/ +class CLComparisonOpKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLComparisonOpKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLComparisonOpKernel(const CLComparisonOpKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLComparisonOpKernel &operator=(const CLComparisonOpKernel &) = delete; + /** Allow instances of this class to be moved */ + CLComparisonOpKernel(CLComparisonOpKernel &&) = default; + /** Allow instances of this class to be moved */ + CLComparisonOpKernel &operator=(CLComparisonOpKernel &&) = default; + /** Initialize the kernel's input, output. + * + * @param[in] input1 Source tensor1. + * @param[in] input2 Source tensor2. + * @param[out] output Output tensor. + */ + void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, + const ComparisonOperation &op); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + + BorderSize border_size() const override; + +private: + const ICLTensor *_input1; + const ICLTensor *_input2; + ICLTensor *_output; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLCOMPARISON_OP_KERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h new file mode 100644 index 000000000..60ec7a82a --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__ +#define __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to perform depthTospace operation */ +class CLDepthToSpaceKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLDepthToSpaceKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLDepthToSpaceKernel(const CLDepthToSpaceKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLDepthToSpaceKernel &operator=(const CLDepthToSpaceKernel &) = delete; + /** Allow instances of this class to be moved */ + CLDepthToSpaceKernel(CLDepthToSpaceKernel &&) = default; + /** Allow instances of this class to be moved */ + CLDepthToSpaceKernel &operator=(CLDepthToSpaceKernel &&) = default; + /** Default destructor */ + ~CLDepthToSpaceKernel() = default; + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + */ + void configure(const ICLTensor *input, ICLTensor *output, const int32_t block_size); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; /**< Source tensor */ + ICLTensor *_output; /**< Destination tensor */ +}; + +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h new file mode 100644 index 000000000..da075db69 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file CLEmbeddingLookupKernel.h + * @ingroup COM_AI_RUNTIME + * @brief This file defines CLEmbeddingLookupKernel class + */ + +#ifndef __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__ +#define __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** +* @brief Class to perform EmbeddingLookup operation with opencl kernel +*/ +class CLEmbeddingLookupKernel : public ICLKernel +{ +public: + /** + * @brief Construct a CLEmbeddingLookupKernel object + * */ + CLEmbeddingLookupKernel(); + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + * */ + CLEmbeddingLookupKernel(const CLEmbeddingLookupKernel &) = delete; + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + * */ + CLEmbeddingLookupKernel &operator=(const CLEmbeddingLookupKernel &) = delete; + + /** + * @brief Construct a CLEmbeddingLookupKernel object by using default move constructor + * @param[in] CLEmbeddingLookupKernel object to move + * */ + CLEmbeddingLookupKernel(CLEmbeddingLookupKernel &&) = default; + + /** + * @brief Move assignment operator + * @param[in] CLEmbeddingLookupKernel object to move + * */ + CLEmbeddingLookupKernel &operator=(CLEmbeddingLookupKernel &&) = default; + + /** + * @brief Destruct this object + * */ + ~CLEmbeddingLookupKernel() = default; + + /** + * @brief Set the input and output of the kernel + * @param[in] input Source tensor. + * Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Destination tensor. Data type supported: Same as @p input + * @param[in] lookups Lookups are 1D tensor that values are indices into the first + * dimension of input. + * Data types supported: S32. + * @return N/A + */ + void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups); + + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref + * CLEmbeddingLookupKernel + * @param[in] input The input tensor info. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[in] output The output tensor info, Data types supported: same as @p input1. + * @param[in] lookups Lookups info. Data types supported: S32. + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *lookups); + + /** + * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command + * queue. + * @note The queue is *not* flushed by this method, and therefore the kernel will not have + * been executed by the time this method returns. + * @param[in] window Region on which to execute the kernel. (Must be a valid region of + * the window returned by window()). 
+ * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; /** Source tensor */ + ICLTensor *_output; /** Destination tensor */ + const ICLTensor *_lookups; /** Lookups tensor */ +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLExpKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLExpKernel.h new file mode 100644 index 000000000..a6ea539f8 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLExpKernel.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLEXPKERNEL_H__ +#define __ARM_COMPUTE_CLEXPKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to perform an exponential operation */ +class CLExpKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLExpKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLExpKernel(const CLExpKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLExpKernel &operator=(const CLExpKernel &) = delete; + /** Allow instances of this class to be moved */ + CLExpKernel(CLExpKernel &&) = default; + /** Allow instances of this class to be moved */ + CLExpKernel &operator=(CLExpKernel &&) = default; + /** Default destructor */ + ~CLExpKernel() = default; + /** Set the source, destination of the kernel + * + * @param[in] input Source tensor. Data type supported: F32. + * @param[out] output Destination tensor. Data type supported: F32. + */ + void configure(const ICLTensor *input, ICLTensor *output); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + ICLTensor *_output; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLEXPKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherKernel.h index a51441aca..7e35a80b0 100644 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherKernel.h +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherKernel.h @@ -14,52 +14,85 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + +/** + * @file CLGatherKernel.h + * @ingroup COM_AI_RUNTIME + * @brief This file defines CLGatherKernel class + */ + #ifndef __ARM_COMPUTE_CLGATHERKERNEL_H__ #define __ARM_COMPUTE_CLGATHERKERNEL_H__ #include "arm_compute/core/CL/ICLKernel.h" -#include "arm_compute/core/Types.h" namespace arm_compute { class ICLTensor; -/** Interface for the gather kernel. 
- * +/** + * @brief Class to define an interface for the gather kernel. */ class CLGatherKernel : public ICLKernel { public: - /** Default constructor.*/ + /** + * @brief Construct CLGatherKernel object + * */ CLGatherKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers). */ + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + */ CLGatherKernel(const CLGatherKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers). */ + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + */ CLGatherKernel &operator=(const CLGatherKernel &) = delete; - /** Allow instances of this class to be moved */ + + /** + * @brief Construct CLGatherKernel object by using default move constructor + * @param[in] CLGatherKernel object to move + */ CLGatherKernel(CLGatherKernel &&) = default; - /** Allow instances of this class to be moved */ + + /** + * @brief Move assignment operator + * @param[in] CLGatherKernel object to move + */ CLGatherKernel &operator=(CLGatherKernel &&) = default; - /** Initialise the kernel's input, output and border mode. - * + + /** + * @brief Initialise the kernel's input, output and border mode. * @param[in] input1 An input tensor. Data types supported: U8/S32/F32. * @param[in] input2 An input tensor. Data types supported: S32. * @param[out] output The output tensor, Data types supported: same as @p input1. + * @return N/A */ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output); - /** Static function to check if given info will lead to a valid configuration of @ref + + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref * CLGatherKernel - * * @param[in] input1 An input tensor. Data types supported: U8/S32/F32. * @param[in] input2 An input tensor. Data types supported: S32. * @param[out] output The output tensor, Data types supported: same as @p input1. - * * @return a status */ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output); - // Inherited methods overridden: + /** + * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command + * queue. + * @note The queue is *not* flushed by this method, and therefore the kernel will not have + * been executed by the time this method returns. + * @param[in] window Region on which to execute the kernel. (Must be a valid region of + * the window returned by window()). + * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A + * @return N/A + */ void run(const Window &window, cl::CommandQueue &queue) override; private: diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h new file mode 100644 index 000000000..c3fc15637 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CLHashtableLookupKernel.h + * @ingroup COM_AI_RUNTIME + * @brief This file defines CLHashtableLookupKernel class + */ + +#ifndef __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__ +#define __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/runtime/CL/CLTensor.h" + +namespace arm_compute +{ +class ICLTensor; + +/** +* @brief Class to perform HashtableLookup operation with opencl kernel +*/ +class CLHashtableLookupKernel : public ICLKernel +{ +public: + /** + * @brief Construct a CLHashtableLookupKernel object + * */ + CLHashtableLookupKernel(); + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + * */ + CLHashtableLookupKernel(const CLHashtableLookupKernel &) = delete; + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + * */ + CLHashtableLookupKernel &operator=(const CLHashtableLookupKernel &) = delete; + + /** + * @brief Construct a CLHashtableLookupKernel object by using default move constructor + * @param[in] CLHashtableLookupKernel object to move + * */ + CLHashtableLookupKernel(CLHashtableLookupKernel &&) = default; + + /** + * @brief Move assignment operator + * @param[in] CLHashtableLookupKernel object to move + * */ + CLHashtableLookupKernel &operator=(CLHashtableLookupKernel &&) = default; + + /** + * @brief Destruct this object + * */ + ~CLHashtableLookupKernel() = default; + + /** + * @brief Set the input and output of the kernel + * @param[in] lookups Lookups 1D tensor that values are indices into the first dimension of + * input. + * @param[in] keys Keys 1D tensor. keys and input pair represent a map. + * Data types supported: S32 + * @param[in] input Source tensor. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p + * input. + * @param[out] hits Hits 1D tensor. A boolean tensor that indicates whether the lookup hits + * (True) or not (False). Data types supported: U8/QASYMM8 + * @return N/A + */ + void configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *input, + ICLTensor *output, ICLTensor *hits); + + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref + * CLHashtableLookupKernel + * @param[in] lookups The lookups tensor info. Data types supported: S32. + * @param[in] keys The keys tensor info. keys and input pair represent a map. + * Data types supported: S32 + * @param[in] input The input tensor info. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output The output tensor. Data types and data layouts supported: Same as @p + * input. + * @param[out] hits The hits tensor info. A boolean tensor that indicates whether the lookup + * hits + * (True) or not (False). 
Data types supported: U8/QASYMM8 + * @return a status + */ + static Status validate(const ITensorInfo *lookups, const ITensorInfo *keys, + const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *hits); + + /** + * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command + * queue. + * @note The queue is *not* flushed by this method, and therefore the kernel will not have + * been executed by the time this method returns. + * @param[in] window Region on which to execute the kernel. (Must be a valid region of + * the window returned by window()). + * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_lookups; /** Lookups tensor */ + const ICLTensor *_keys; /** Keys tensor */ + const ICLTensor *_input; /** Source tensor */ + ICLTensor *_output; /** Destination tensor */ + ICLTensor *_hits; /** Hits tensor */ + std::unique_ptr<CLTensor> _lookup_indices{nullptr}; /** Lookup indices tensor */ +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h new file mode 100644 index 000000000..ccbea147e --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLNEGKERNEL_H__ +#define __ARM_COMPUTE_CLNEGKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to perform a negation operation on tensor*/ +class CLNegKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLNegKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLNegKernel(const CLNegKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLNegKernel &operator=(const CLNegKernel &) = delete; + /** Allow instances of this class to be moved */ + CLNegKernel(CLNegKernel &&) = default; + /** Allow instances of this class to be moved */ + CLNegKernel &operator=(CLNegKernel &&) = default; + /** Initialize the kernel's input, output. + * + * @param[in] input Source tensor. + * @param[out] output Destination tensor. 
+ */ + void configure(const ICLTensor *input, ICLTensor *output); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + ICLTensor *_output; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLNEGKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h new file mode 100644 index 000000000..181a6226a --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLNORMALIZATIONLAYEREXKERNEL_H__ +#define __ARM_COMPUTE_CLNORMALIZATIONLAYEREXKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the normalization layer kernel. + */ +class CLNormalizationLayerExKernel : public ICLKernel +{ +public: + /** Constructor */ + CLNormalizationLayerExKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLNormalizationLayerExKernel(const CLNormalizationLayerExKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLNormalizationLayerExKernel &operator=(const CLNormalizationLayerExKernel &) = delete; + /** Default Move Constructor. */ + CLNormalizationLayerExKernel(CLNormalizationLayerExKernel &&) = default; + /** Default move assignment operator */ + CLNormalizationLayerExKernel &operator=(CLNormalizationLayerExKernel &&) = default; + /** Set the input and output tensors. + * + * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions + * [width, height, IFM], + * and an optional 4th dimension for batch of inputs. Data types supported: + * F16/F32. + * @param[out] output Destination tensor. Output will have the same number of dimensions as + * input. Data types supported: same as @p input. + * @param[in] norm_info Normalization layer information like the normalization type, + * normalization size and other parameters. + */ + void configure(const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLNormalizationLayerKernel + * + * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions + * [width, height, IFM], + * and an optional 4th dimension for batch of inputs. Data types supported: + * F16/F32. + * @param[in] output Destination tensor. Output will have the same number of dimensions as + * input. Data types supported: same as @p input. + * @param[in] norm_info Normalization layer information like the normalization type, normalization + * size and other parameters. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + NormalizationLayerInfo norm_info); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + BorderSize border_size() const override; + +private: + const ICLTensor *_input; + ICLTensor *_output; + BorderSize _border_size; + bool _is_in_map; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLNORMALIZATIONLAYEREXKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h new file mode 100644 index 000000000..eff1b8bd5 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLPRELU_KERNEL_H__ +#define __ARM_COMPUTE_CLPRELU_KERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to calculate PReLU*/ +class CLPReLUKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLPReLUKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLPReLUKernel(const CLPReLUKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLPReLUKernel &operator=(const CLPReLUKernel &) = delete; + /** Allow instances of this class to be moved */ + CLPReLUKernel(CLPReLUKernel &&) = default; + /** Allow instances of this class to be moved */ + CLPReLUKernel &operator=(CLPReLUKernel &&) = default; + /** Initialize the kernel's input, output. + * + * @param[in] input Source tensor1. + * @param[in] alpha Source tensor2. + * @param[out] output Output tensor. + */ + void configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + + BorderSize border_size() const override; + +private: + const ICLTensor *_input; + const ICLTensor *_alpha; + ICLTensor *_output; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLPRELU_KERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernel.h new file mode 100644 index 000000000..cbaa2adee --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernel.h @@ -0,0 +1,60 @@ +/* +* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +* Copyright (c) 2016-2018 ARM Limited. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ +#ifndef __ARM_COMPUTE_CLPADLAYERKERNEL_H__ +#define __ARM_COMPUTE_CLPADLAYERKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to perform PAD operation */ +class CLPadLayerKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLPadLayerKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLPadLayerKernel(const CLPadLayerKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLPadLayerKernel &operator=(const CLPadLayerKernel &) = delete; + /** Allow instances of this class to be moved */ + CLPadLayerKernel(CLPadLayerKernel &&) = default; + /** Allow instances of this class to be moved */ + CLPadLayerKernel &operator=(CLPadLayerKernel &&) = default; + /** Default destructor */ + ~CLPadLayerKernel() = default; + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[in] pad_size Padding Size tensor. Data types supported : S32 + */ + void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *pad_size); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; /**< Source tensor */ + ICLTensor *_output; /**< Destination tensor */ + ICLTensor *_pad_size; /**< Padding Size tensor */ +}; + +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLPADLAYERKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPermuteExKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPermuteExKernel.h new file mode 100644 index 000000000..3434deee8 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPermuteExKernel.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLPERMUTEEXKERNEL_H__ +#define __ARM_COMPUTE_CLPERMUTEEXKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to perform tensor permutation. 
+ * + * Permutes given a permutation vector + */ +class CLPermuteExKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLPermuteExKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLPermuteExKernel(const CLPermuteExKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLPermuteExKernel &operator=(const CLPermuteExKernel &) = delete; + /** Allow instances of this class to be moved */ + CLPermuteExKernel(CLPermuteExKernel &&) = default; + /** Allow instances of this class to be moved */ + CLPermuteExKernel &operator=(CLPermuteExKernel &&) = default; + /** Set the input and output of the kernel. + * + * @param[in] input The input tensor to permute. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] output The output tensor. Data types supported: Same as @p input + * @param[in] perm Permutation vector + */ + void configure(const ICLTensor *input, ICLTensor *output, const PermutationVector &perm); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLPermuteKernel + * + * @param[in] input First tensor input info. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] output Output tensor info. Data types supported: same as @p input. + * @param[in] perm Permutation vector + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const PermutationVector &perm); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + ICLTensor *_output; + PermutationVector _perm; +}; +} // arm_compute +#endif /*__ARM_COMPUTE_CLPERMUTEEXKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h index cd2b255bc..d579f5d8f 100644 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h @@ -14,68 +14,106 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + +/** + * @file CLPixelWiseDivisionKernel.h + * @ingroup COM_AI_RUNTIME + * @brief This file defines CLPixelWiseDivisionKernel class + */ + #ifndef __ARM_COMPUTE_CLPIXELWISEDIVISIONKERNEL_H__ #define __ARM_COMPUTE_CLPIXELWISEDIVISIONKERNEL_H__ #include "arm_compute/core/CL/ICLKernel.h" -#include "arm_compute/core/Types.h" namespace arm_compute { class ICLTensor; -/** Interface for the pixelwise division kernel. - * +/** + * @brief Interface for the pixelwise division kernel. */ class CLPixelWiseDivisionKernel : public ICLKernel { public: - /** Default constructor.*/ + /** + * @brief Construct a CLPixelWiseDivisionKernel object + */ CLPixelWiseDivisionKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers). */ + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + */ CLPixelWiseDivisionKernel(const CLPixelWiseDivisionKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers). */ + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
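As an illustration of the CLPermuteExKernel interface introduced above, a minimal usage sketch follows. It assumes an already-initialised CLScheduler and pre-allocated tensors; the helper name and permutation values are illustrative assumptions, not part of the patch, and the runtime wrapper function would normally be preferred over driving the kernel directly.

#include "arm_compute/core/CL/kernels/CLPermuteExKernel.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

// Illustrative helper: permute a 3-D tensor with an example permutation vector.
void run_permute_example(CLTensor &input, CLTensor &output)
{
  const PermutationVector perm(2U, 0U, 1U); // example permutation only
  // Validate the configuration first, then bind the tensors and enqueue the kernel.
  ARM_COMPUTE_ERROR_THROW_ON(CLPermuteExKernel::validate(input.info(), output.info(), perm));
  CLPermuteExKernel kernel;
  kernel.configure(&input, &output, perm);
  CLScheduler::get().enqueue(kernel);
}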
+ */ CLPixelWiseDivisionKernel &operator=(const CLPixelWiseDivisionKernel &) = delete; - /** Allow instances of this class to be moved */ + + /** + * @brief Construct a CLPixelWiseDivisionKernel object by using move constructor + * @param[in] CLPixelWiseDivisionKernel object to move + */ CLPixelWiseDivisionKernel(CLPixelWiseDivisionKernel &&) = default; - /** Allow instances of this class to be moved */ + + /** + * @brief Allow instances of this class to be moved + * @param[in] CLPixelWiseDivisionKernel object to move + */ CLPixelWiseDivisionKernel &operator=(CLPixelWiseDivisionKernel &&) = default; - /** Initialise the kernel's input, output and border mode. - * - * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32. + + /** + * @brief Initialise the kernel's input, output and border mode. + * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32. * @param[in] input2 An input tensor. Data types supported: same as @p input1. * @param[out] output The output tensor, Data types supported: same as @p input1. Note: - * U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). + * U8 requires both inputs to be U8. * @param[in] scale Scale to apply after division. * Scale must be positive and its value must be either 1/255 or 1/2^n - * where n is between 0 and 15. For QS8 and QS16 scale must be 1. + * where n is between 0 and 15. * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest * even. + * @return N/A */ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); - /** Static function to check if given info will lead to a valid configuration of @ref + + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref * CLPixelWiseDivisionKernel - * - * @param[in] input1 An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32. + * @param[in] input1 An input tensor info. Data types supported: U8/S16/F16/F32. * @param[in] input2 An input tensor info. Data types supported: same as @p input1. * @param[in] output The output tensor info, Data types supported: same as @p input1. - * Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). + * Note: U8 requires both inputs to be U8. * @param[in] scale Scale to apply after division. * Scale must be positive and its value must be either 1/255 or 1/2^n - * where n is between 0 and 15. For QS8 and QS16 scale must be 1. + * where n is between 0 and 15. * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. - * * @return a status */ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); - // Inherited methods overridden: + /** + * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command + * queue. + * @note The queue is *not* flushed by this method, and therefore the kernel will not have + * been executed by the time this method returns. + * @param[in] window Region on which to execute the kernel. (Must be a valid region of + * the window returned by window()). 
+ * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A + * @return N/A + */ void run(const Window &window, cl::CommandQueue &queue) override; + + /** + * @brief The size of the border for that kernel + * @return The width in number of elements of the border. + */ BorderSize border_size() const override; private: diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h deleted file mode 100644 index a7d96cc5c..000000000 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __ARM_COMPUTE_CLREDUCEMAXKERNEL_H__ -#define __ARM_COMPUTE_CLREDUCEMAXKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" -#include "arm_compute/core/Types.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Interface for the pixelwise division kernel. - * - */ -class CLReduceMaxKernel : public ICLKernel -{ -public: - /** Default constructor.*/ - CLReduceMaxKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers). */ - CLReduceMaxKernel(const CLReduceMaxKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers). */ - CLReduceMaxKernel &operator=(const CLReduceMaxKernel &) = delete; - /** Allow instances of this class to be moved */ - CLReduceMaxKernel(CLReduceMaxKernel &&) = default; - /** Allow instances of this class to be moved */ - CLReduceMaxKernel &operator=(CLReduceMaxKernel &&) = default; - /** Initialise the kernel's input, output and border mode. - * - * @param[in] input An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32. - * @param[in] axis Axis to reduce - * @param[out] output The output tensor, Data types supported: same as @p input1. Note: - * U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). - */ - void configure(const ICLTensor *input, int32_t axis, ICLTensor *output); - /** Static function to check if given info will lead to a valid configuration of @ref - * CLReduceMaxKernel - * - * @param[in] input An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32. - * @param[in] axis Axis to reduce - * @param[in] output The output tensor info, Data types supported: same as @p input1. - * Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). 
- * - * @return a status - */ - static Status validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - void run_on_cpu(cl::CommandQueue &queue); - -private: - const ICLTensor *_input; - ICLTensor *_output; - int32_t _axis; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLREDUCEMAXKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h new file mode 100644 index 000000000..a26a4a7fc --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CLReduceOperationKernel.h + * @brief This file defines CLReduceOperationKernel class + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__ +#define __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to define interface for the reduce operation kernel + */ +class CLReduceOperationKernel : public ICLKernel +{ +public: + /** + * @brief Default constructor + */ + CLReduceOperationKernel(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + */ + CLReduceOperationKernel(const CLReduceOperationKernel &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + */ + CLReduceOperationKernel &operator=(const CLReduceOperationKernel &) = delete; + /** + * @brief Allow instances of this class to be moved + */ + CLReduceOperationKernel(CLReduceOperationKernel &&) = default; + /** + * @brief Allow instances of this class to be moved + */ + CLReduceOperationKernel &operator=(CLReduceOperationKernel &&) = default; + /** + * @brief Default destructor + */ + ~CLReduceOperationKernel() = default; + + /** + * @brief Set the input and output tensors. + * @param[in] input Source tensor. Data types supported: U8/S32/F32. + * @param[out] output Destination tensor. Data types supported: Same as @p input. + * Output will have the same number of dimensions as input. + * @param[in] axis Axis along which to reduce. + * @param[in] op Reduce operation to perform. + * @return N/A + */ + void configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis, + ReduceOperation op); + + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref + * CLReduceOperationKernel. + * @param[in] input Source tensor info. Data types supported: U8/S32/F32. + * @param[in] output Destination tensor info. Data types supported: Same as @p input. 
+ * Output will have the same number of dimensions as input. + * @param[in] axis Axis along which to reduce. + * @param[in] op Reduce operation to perform. + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis, + ReduceOperation op); + + /* + * @brief Run CLReduceOperationKernel op + * @param[in] window Window to be used for in_slice + * @param[in] queue CLQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + ICLTensor *_output; + uint32_t _axis; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReductionMeanKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReductionMeanKernel.h deleted file mode 100644 index de9df3381..000000000 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReductionMeanKernel.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __ARM_COMPUTE_CLREDUCTIONMEANKERNEL_H__ -#define __ARM_COMPUTE_CLREDUCTIONMEANKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" -#include "arm_compute/core/Types.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Interface for the reduction operation kernel */ -class CLReductionMeanKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLReductionMeanKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLReductionMeanKernel(const CLReductionMeanKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLReductionMeanKernel &operator=(const CLReductionMeanKernel &) = delete; - /** Allow instances of this class to be moved */ - CLReductionMeanKernel(CLReductionMeanKernel &&) = default; - /** Allow instances of this class to be moved */ - CLReductionMeanKernel &operator=(CLReductionMeanKernel &&) = default; - /** Default destructor */ - ~CLReductionMeanKernel() = default; - - /** Set the input and output tensors. - * - * @param[in] input Source tensor. Data types supported: F32. Data layouts supported: NCHW. - * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input. - * Output will have the same number of dimensions as input. - * @param[in] axis Axis along which to reduce. Supported reduction axis : 0, 1 - */ - void configure(const ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis); - - /** Static function to check if given info will lead to a valid configuration of @ref - * CLReductionMeanKernel. - * - * @param[in] input Source tensor info. Data types supported: F32. Data layouts supported: NCHW. - * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p - * input. 
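For reference, a hedged sketch of driving the CLReduceOperationKernel declared above: check validate(), then configure() and enqueue. The tensor objects, the axis value and the choice of ReduceOperation::MAX are illustrative assumptions; in practice the corresponding runtime function would manage this.

#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

// Illustrative helper: reduce `input` along one axis with the MAX operation.
void run_reduce_max_example(CLTensor &input, CLTensor &output)
{
  const uint32_t axis = 0; // example axis, not taken from the patch
  ARM_COMPUTE_ERROR_THROW_ON(
      CLReduceOperationKernel::validate(input.info(), output.info(), axis, ReduceOperation::MAX));
  CLReduceOperationKernel kernel;
  kernel.configure(&input, &output, axis, ReduceOperation::MAX);
  CLScheduler::get().enqueue(kernel);
}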
- * Output will have the same number of dimensions as input. - * @param[in] axis Axis along which to reduce. Supported reduction axis : 0, 1 - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - std::vector<uint32_t> axis); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - BorderSize border_size() const override; - -private: - const ICLTensor *_input; - ICLTensor *_output; - std::vector<uint32_t> _reduction_axis; - BorderSize _border_size; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLREDUCTIONMEANKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h new file mode 100644 index 000000000..68534f1ab --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_KERNEL_H__ +#define __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_KERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to perform SPACE_TO_BATCH_ND operation */ +class CLSpaceToBatchNDKernel final : public ICLKernel +{ +public: + /** Default constructor */ + CLSpaceToBatchNDKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLSpaceToBatchNDKernel(const CLSpaceToBatchNDKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLSpaceToBatchNDKernel &operator=(const CLSpaceToBatchNDKernel &) = delete; + /** Allow instances of this class to be moved */ + CLSpaceToBatchNDKernel(CLSpaceToBatchNDKernel &&) = default; + /** Allow instances of this class to be moved */ + CLSpaceToBatchNDKernel &operator=(CLSpaceToBatchNDKernel &&) = default; + /** Default destructor */ + ~CLSpaceToBatchNDKernel() = default; + /** Initialise the kernel's input and output. + * + * @note The data layout of input and output must be the same. + * @note The number of dimensions of input and output must be 4, and `spatial` dimensions + * are height and width. + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32. + * Data layout supported: NCHW/NHWC + * @param[in] block_size Block size tensor. Data types supported: S32. + * @param[in] padding_size Padding size tensor. Data types supported: S32. + * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32. 
+ * Data layout supported: NCHW/NHWC + */ + void configure(const ICLTensor *input, const ICLTensor *block_size, const ICLTensor *padding_size, + ICLTensor *output); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; /**< Source tensor */ + const ICLTensor *_block_size; /**< Block size tensor */ + const ICLTensor *_padding_size; /**< Padding size tensor */ + ICLTensor *_output; /**< Destination tensor */ +}; + +} // namespace arm_compute + +#endif /* __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_KERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h new file mode 100644 index 000000000..be845a549 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__ +#define __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to perform spaceTodepth operation */ +class CLSpaceToDepthKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLSpaceToDepthKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLSpaceToDepthKernel(const CLSpaceToDepthKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLSpaceToDepthKernel &operator=(const CLSpaceToDepthKernel &) = delete; + /** Allow instances of this class to be moved */ + CLSpaceToDepthKernel(CLSpaceToDepthKernel &&) = default; + /** Allow instances of this class to be moved */ + CLSpaceToDepthKernel &operator=(CLSpaceToDepthKernel &&) = default; + /** Default destructor */ + ~CLSpaceToDepthKernel() = default; + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. 
+ */ + void configure(const ICLTensor *input, ICLTensor *output, const int32_t block_size); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; /**< Source tensor */ + ICLTensor *_output; /**< Destination tensor */ +}; + +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSquaredDifferenceKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSquaredDifferenceKernel.h new file mode 100644 index 000000000..a4c44e35d --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSquaredDifferenceKernel.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLSQUARED_DIFFERENCE_KERNEL_H__ +#define __ARM_COMPUTE_CLSQUARED_DIFFERENCE_KERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to return squared difference value of two tensors (x-y)^2*/ +class CLSquaredDifferenceKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLSquaredDifferenceKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLSquaredDifferenceKernel(const CLSquaredDifferenceKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLSquaredDifferenceKernel &operator=(const CLSquaredDifferenceKernel &) = delete; + /** Allow instances of this class to be moved */ + CLSquaredDifferenceKernel(CLSquaredDifferenceKernel &&) = default; + /** Allow instances of this class to be moved */ + CLSquaredDifferenceKernel &operator=(CLSquaredDifferenceKernel &&) = default; + /** Initialize the kernel's input, output. + * + * @param[in] input1 Source tensor1. + * @param[in] input2 Source tensor2. + * @param[out] output Output tensor. + */ + void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + + BorderSize border_size() const override; + +private: + const ICLTensor *_input1; + const ICLTensor *_input2; + ICLTensor *_output; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLSQUARED_DIFFERENCE_KERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceExKernel.h index 248ae6635..6368c380e 100644 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceKernel.h +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceExKernel.h @@ -14,36 +14,64 @@ * See the License for the specific language governing permissions and * limitations under the License. 
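A similar hedged sketch for the element-wise CLSquaredDifferenceKernel declared above, which computes (x - y)^2 for two tensors of identical shape; the tensor objects and the helper name are assumptions for illustration only, and allocation and scheduler setup are assumed to happen elsewhere.

#include "arm_compute/core/CL/kernels/CLSquaredDifferenceKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

// Illustrative helper: out[i] = (x[i] - y[i])^2 for tensors of identical shape.
void run_squared_difference_example(CLTensor &x, CLTensor &y, CLTensor &out)
{
  CLSquaredDifferenceKernel kernel;
  kernel.configure(&x, &y, &out); // bind the two inputs and the output
  CLScheduler::get().enqueue(kernel);
}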
*/ -#ifndef __ARM_COMPUTE_CLSTRIDEDSLICEKERNEL_H__ -#define __ARM_COMPUTE_CLSTRIDEDSLICEKERNEL_H__ + +/** + * @file CLStridedSliceExKernel.h + * @ingroup COM_AI_RUNTIME + * @brief This file defines CLStridedSliceExKernel class + */ + +#ifndef __ARM_COMPUTE_CLSTRIDEDSLICEEXKERNEL_H__ +#define __ARM_COMPUTE_CLSTRIDEDSLICEEXKERNEL_H__ #include "arm_compute/core/CL/ICLKernel.h" -#include "arm_compute/core/Types.h" namespace arm_compute { class ICLTensor; -/** Interface for the kernel to extract a strided slice of a tensor */ -class CLStridedSliceKernel : public ICLKernel +/** +* @brief Class to define an interface for the kernel to extract a strided slice of a tensor +*/ +class CLStridedSliceExKernel : public ICLKernel { public: - /** Default constructor */ - CLStridedSliceKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLStridedSliceKernel(const CLStridedSliceKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLStridedSliceKernel &operator=(const CLStridedSliceKernel &) = delete; - /** Allow instances of this class to be moved */ - CLStridedSliceKernel(CLStridedSliceKernel &&) = default; - /** Allow instances of this class to be moved */ - CLStridedSliceKernel &operator=(CLStridedSliceKernel &&) = default; - /** Default destructor */ - ~CLStridedSliceKernel() = default; - /** Set the input and output of the kernel - * + /** + * @brief Construct a CLStridedSliceExKernel object + * */ + CLStridedSliceExKernel(); + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + * */ + CLStridedSliceExKernel(const CLStridedSliceExKernel &) = delete; + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + * */ + CLStridedSliceExKernel &operator=(const CLStridedSliceExKernel &) = delete; + + /** + * @brief Construct a CLStridedSliceExKernel object by using default move constructor + * @param[in] CLStridedSliceExKernel object to move + * */ + CLStridedSliceExKernel(CLStridedSliceExKernel &&) = default; + + /** + * @brief Move assignment operator + * @param[in] CLStridedSliceExKernel object to move + * */ + CLStridedSliceExKernel &operator=(CLStridedSliceExKernel &&) = default; + + /** + * @brief Destruct this object + * */ + ~CLStridedSliceExKernel() = default; + + /** + * @brief Set the input and output of the kernel * @param[in] input Source tensor. Data type supported: - * U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32 + * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 * @param[out] output Destination tensor. Data type supported: Same as @p input * @param[in] beginData The begin tensor. Data types supported: S32. * The number of dimensions must be 1. @@ -57,17 +85,17 @@ public: * @param[in] beginMask Mask for begin * @param[in] endMask Mask for end * @param[in] shrinkAxisMask Mask for shrink axis. - * + * @return N/A */ void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData, ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask, int32_t shrinkAxisMask); - /** Static function to check if given info will lead to a valid configuration of @ref - * CLStridedSliceKernel - * + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref + * CLStridedSliceExKernel * @param[in] input The input tensor info. 
Data types supported: - * U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32 + * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 * @param[in] output The output tensor info, Data types supported: same as @p input1. * @param[in] begin The begin tensor info. Data types supported: S32. * The number of dimensions must be 1. @@ -81,7 +109,6 @@ public: * @param[in] beginMask Mask for begin * @param[in] endMask Mask for end * @param[in] shrinkAxisMask Mask for shrink axis. - * * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *output, @@ -89,7 +116,16 @@ public: const ITensorInfo *stride, int32_t beginMask, int32_t endMask, int32_t shrinkAxisMask); - // Inherited methods overridden: + /** + * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command + * queue. + * @note The queue is *not* flushed by this method, and therefore the kernel will not have + * been executed by the time this method returns. + * @param[in] window Region on which to execute the kernel. (Must be a valid region of + * the window returned by window()). + * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A + * @return N/A + */ void run(const Window &window, cl::CommandQueue &queue) override; private: @@ -103,4 +139,4 @@ private: int32_t _shrinkAxisMask; /** Shrink axis mask */ }; } // namespace arm_compute -#endif /*__ARM_COMPUTE_CLSTRIDEDSLICEKERNEL_H__ */ +#endif /*__ARM_COMPUTE_CLSTRIDEDSLICEEXKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h index 5c567f38e..eb2bad254 100644 --- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h @@ -14,14 +14,18 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + +/** + * @file CLTopKV2Kernel.h + * @brief This file defines classes for TopKV2Kernel + * @ingroup COM_AI_RUNTIME + */ + #ifndef __ARM_COMPUTE_CLTOPKV2KERNEL_H__ #define __ARM_COMPUTE_CLTOPKV2KERNEL_H__ -#include "arm_compute/core/CL/ICLArray.h" #include "arm_compute/core/CL/ICLKernel.h" -#include <array> - // these parameters can be changed #define _ITEMS 16 // number of items in a group #define _GROUPS 4 // the number of virtual processors is _ITEMS * _GROUPS @@ -33,24 +37,59 @@ namespace arm_compute { class ICLTensor; +/** + * @brief Class to define CLTopKV2Single + */ class CLTopKV2Single : public ICLKernel { public: - /** Constructor */ + /** + * @brief Constructor + */ CLTopKV2Single(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2Single to be copied + */ CLTopKV2Single(const CLTopKV2Single &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLTopKV2Single to be copied + * @return Reference of this instance + */ CLTopKV2Single &operator=(const CLTopKV2Single &) = delete; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Single to be moved + */ CLTopKV2Single(CLTopKV2Single &&) = default; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Single to be moved + * @return Reference of this instance + */ CLTopKV2Single &operator=(CLTopKV2Single &&) = default; + /** + * @brief Initialise kernel with params + * @param[in] input An input tensor + * @param[in] topk_values Values of the top k predictions + * @param[in] topk_indices Indices of the top k predictions + * @param[in] indices Indices + * @param[in] temp_stack Temp stack + * @param[in] k K of the top k predictions + * @param[in] n Number times to quick-sort + * return N/A + */ void configure(ICLTensor *input, ICLTensor *topk_values, ICLTensor *topk_indices, cl::Buffer *indices, cl::Buffer *temp_stack, int k, int n); - // Inherited methods overridden: + /* + * @brief Run CLTopKV2Single op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ void run(const Window &window, cl::CommandQueue &queue) override; private: @@ -59,52 +98,121 @@ private: ICLTensor *_topk_indices; }; +/** + * @brief Class to define CLTopKV2Init + */ class CLTopKV2Init : public ICLKernel { public: - /** Constructor */ + /** + * @brief Constructor + */ CLTopKV2Init(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2Init to be copied + */ CLTopKV2Init(const CLTopKV2Init &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLTopKV2Init to be copied + * @return Reference of this instance + */ CLTopKV2Init &operator=(const CLTopKV2Init &) = delete; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Init to be moved + */ CLTopKV2Init(CLTopKV2Init &&) = default; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Init to be moved + * @return Reference of this instance + */ CLTopKV2Init &operator=(CLTopKV2Init &&) = default; + /** + * @brief Initialise kernel with params + * @param[in] input An input tensor + * @param[in] in_key_buf Buffer of input key + * @param[in] in_ind_buf Buffer of input index + * @param[in] n Number times to quick-sort + * return N/A + */ void configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffer *in_ind_buf, int n); - // Inherited methods overridden: + /* + * @brief Run CLTopKV2Init op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ void run(const Window &window, cl::CommandQueue &queue) override; private: ICLTensor *_input; }; +/** + * @brief Class to define CLRadixSortHistogram + */ class CLRadixSortHistogram : public ICLKernel { public: - /** Constructor */ + /** + * @brief Constructor + */ CLRadixSortHistogram(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortHistogram to be copied + */ CLRadixSortHistogram(const CLRadixSortHistogram &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLRadixSortHistogram to be copied + * @return Reference of this instance + */ CLRadixSortHistogram &operator=(const CLRadixSortHistogram &) = delete; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortHistogram to be moved + */ CLRadixSortHistogram(CLRadixSortHistogram &&) = default; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortHistogram to be moved + * @return Reference of this instance + */ CLRadixSortHistogram &operator=(CLRadixSortHistogram &&) = default; + /** + * @brief Initialise kernel with params + * @param[out] hist_buf Buffer of histogram + * @param[in] bits Number of bits to be used for radix sort + * @param[in] n Integer number size to sort + * return N/A + */ void configure(cl::Buffer *hist_buf, int bits, int n); + /** + * @brief Set pass + * @param[in] pass Passes made of in radix sort algorithm + * @param[in] in_key_buf Buffer of input key + * return N/A + */ void setPass(int pass, cl::Buffer *in_key_buf) { _pass = pass; _in_key_buf = in_key_buf; } - // Inherited methods overridden: + /* + * @brief Run CLRadixSortHistogram op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ void run(const Window &window, cl::CommandQueue &queue) override; private: @@ -112,82 +220,210 @@ private: cl::Buffer *_in_key_buf; }; +/** + * @brief Class to define CLRadixSortScanHistogram + */ class CLRadixSortScanHistogram : public ICLKernel { public: - /** Constructor */ + /** + * @brief Constructor + */ CLRadixSortScanHistogram(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortScanHistogram to be copied + */ CLRadixSortScanHistogram(const CLRadixSortScanHistogram &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLRadixSortScanHistogram to be copied + * @return Reference of this instance + */ CLRadixSortScanHistogram &operator=(const CLRadixSortScanHistogram &) = delete; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortScanHistogram to be moved + */ CLRadixSortScanHistogram(CLRadixSortScanHistogram &&) = default; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortScanHistogram to be moved + * @return Reference of this instance + */ CLRadixSortScanHistogram &operator=(CLRadixSortScanHistogram &&) = default; + /** + * @brief Initialise kernel with params + * @param[out] hist_buf Buffer of histogram + * @param[out] glob_sum_buf Buffer of global sum + * @param[in] bits Number of bits to be used for radix sort + * return N/A + */ void configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits); - // Inherited methods overridden: + /* + * @brief Run CLRadixSortScanHistogram op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ void run(const Window &window, cl::CommandQueue &queue) override; }; +/** + * @brief Class to define CLRadixSortGlobalScanHistogram + */ class CLRadixSortGlobalScanHistogram : public ICLKernel { public: - /** Constructor */ + /** + * @brief Constructor + */ CLRadixSortGlobalScanHistogram(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortGlobalScanHistogram to be copied + */ CLRadixSortGlobalScanHistogram(const CLRadixSortGlobalScanHistogram &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLRadixSortGlobalScanHistogram to be copied + * @return Reference of this instance + */ CLRadixSortGlobalScanHistogram &operator=(const CLRadixSortGlobalScanHistogram &) = delete; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortGlobalScanHistogram to be moved + */ CLRadixSortGlobalScanHistogram(CLRadixSortGlobalScanHistogram &&) = default; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortGlobalScanHistogram to be moved + * @return Reference of this instance + */ CLRadixSortGlobalScanHistogram &operator=(CLRadixSortGlobalScanHistogram &&) = default; + /** + * @brief Initialise kernel with params + * @param[out] glob_sum_buf Buffer of global sum + * @param[out] temp_buf Temp buffer to be used while RadixSortGlobalScanHistogram + * @param[in] bits Number of bits to be used for radix sort + * return N/A + */ void configure(cl::Buffer *glob_sum_buf, cl::Buffer *temp_buf, int bits); - // Inherited methods overridden: + /* + * @brief Run CLRadixSortGlobalScanHistogram op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ void run(const Window &window, cl::CommandQueue &queue) override; }; +/** + * @brief Class to define CLRadixSortPasteHistogram + */ class CLRadixSortPasteHistogram : public ICLKernel { public: - /** Constructor */ + /** + * @brief Constructor + */ CLRadixSortPasteHistogram(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortPasteHistogram to be copied + */ CLRadixSortPasteHistogram(const CLRadixSortPasteHistogram &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLRadixSortPasteHistogram to be copied + * @return Reference of this instance + */ CLRadixSortPasteHistogram &operator=(const CLRadixSortPasteHistogram &) = delete; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortPasteHistogram to be moved + */ CLRadixSortPasteHistogram(CLRadixSortPasteHistogram &&) = default; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortPasteHistogram to be moved + * @return Reference of this instance + */ CLRadixSortPasteHistogram &operator=(CLRadixSortPasteHistogram &&) = default; + /** + * @brief Initialise kernel with params + * @param[out] hist_buf Buffer of histogram + * @param[out] glob_sum_buf Buffer of global sum + * @param[in] bits Number of bits to be used for radix sort + * return N/A + */ void configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits); - // Inherited methods overridden: + /* + * @brief Run CLRadixSortPasteHistogram op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ void run(const Window &window, cl::CommandQueue &queue) override; }; +/** + * @brief Class to define CLRadixSortReorder + */ class CLRadixSortReorder : public ICLKernel { public: - /** Constructor */ + /** + * @brief Constructor + */ CLRadixSortReorder(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortReorder to be copied + */ CLRadixSortReorder(const CLRadixSortReorder &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLRadixSortReorder to be copied + * @return Reference of this instance + */ CLRadixSortReorder &operator=(const CLRadixSortReorder &) = delete; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortReorder to be moved + */ CLRadixSortReorder(CLRadixSortReorder &&) = default; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortReorder to be moved + * @return Reference of this instance + */ CLRadixSortReorder &operator=(CLRadixSortReorder &&) = default; + /** + * @brief Initialise kernel with params + * @param[out] hist_buf Buffer of histogram + * @param[in] bits Number of bits to be used for radix sort + * @param[in] n Integer number size to sort + * return N/A + */ void configure(cl::Buffer *hist_buf, int bits, int n); + /** + * @brief Set pass + * @param[in] pass Passes made of in radix sort algorithm + * @param[in] in_key_buf Buffer of input key + * @param[out] out_key_buf Buffer of output key + * @param[in] in_ind_buf Buffer of input index + * @param[out] out_ind_buf Buffer of output index + * return N/A + */ void setPass(int pass, cl::Buffer *in_key_buf, cl::Buffer *out_key_buf, cl::Buffer *in_ind_buf, cl::Buffer *out_ind_buf) { @@ -197,7 +433,12 @@ public: _in_ind_buf = in_ind_buf; _out_ind_buf = out_ind_buf; } - // Inherited methods overridden: + /* + * @brief Run CLRadixSortReorder op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ void run(const Window &window, cl::CommandQueue &queue) override; private: @@ -208,47 +449,115 @@ private: cl::Buffer *_out_ind_buf; }; +/** + * @brief Class to define CLTopKV2FindFirstNegative + */ class CLTopKV2FindFirstNegative : public ICLKernel { public: - /** Constructor */ + /** + * @brief Constructor + */ CLTopKV2FindFirstNegative(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2FindFirstNegative to be copied + */ CLTopKV2FindFirstNegative(const CLTopKV2FindFirstNegative &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLTopKV2FindFirstNegative to be copied + * @return Reference of this instance + */ CLTopKV2FindFirstNegative &operator=(const CLTopKV2FindFirstNegative &) = delete; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2FindFirstNegative to be moved + */ CLTopKV2FindFirstNegative(CLTopKV2FindFirstNegative &&) = default; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2FindFirstNegative to be moved + * @return Reference of this instance + */ CLTopKV2FindFirstNegative &operator=(CLTopKV2FindFirstNegative &&) = default; + /** + * @brief Initialise kernel with params + * @param[out] first_negative_idx_buf Buffer of the first negative index + * @param[in] n Number times to find + * return N/A + */ void configure(cl::Buffer *first_negative_idx_buf, int n); + /** + * @brief Set output buffer + * @param[out] out_key_buf Buffer of output key + * return N/A + */ void setOutputBuffer(cl::Buffer *out_key_buf) { _out_key_buf = out_key_buf; } - // Inherited methods overridden: + /* + * @brief Run CLTopKV2FindFirstNegative op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ void run(const Window &window, cl::CommandQueue &queue) override; private: cl::Buffer *_out_key_buf; }; +/** + * @brief Class to define CLTopKV2ReorderNegatives + */ class CLTopKV2ReorderNegatives : public ICLKernel { public: - /** Constructor */ + /** + * @brief Constructor + */ CLTopKV2ReorderNegatives(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2ReorderNegatives to be copied + */ CLTopKV2ReorderNegatives(const CLTopKV2ReorderNegatives &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLTopKV2ReorderNegatives to be copied + * @return Reference of this instance + */ CLTopKV2ReorderNegatives &operator=(const CLTopKV2ReorderNegatives &) = delete; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2ReorderNegatives to be moved + */ CLTopKV2ReorderNegatives(CLTopKV2ReorderNegatives &&) = default; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2ReorderNegatives to be moved + * @return Reference of this instance + */ CLTopKV2ReorderNegatives &operator=(CLTopKV2ReorderNegatives &&) = default; + /** + * @brief Initialise kernel with params + * @param[out] first_negative_idx_buf Buffer of the first negative index + * @param[in] n Number times to find + * return N/A + */ void configure(cl::Buffer *first_negative_idx_buf, int n); + /** + * @brief Set buffers + * @param[in] in_key_buf Buffer of input key + * @param[out] out_key_buf Buffer of output key + * @param[in] in_ind_buf Buffer of input index + * @param[out] out_ind_buf Buffer of output index + * return N/A + */ void setBuffers(cl::Buffer *in_key_buf, cl::Buffer *out_key_buf, cl::Buffer *in_ind_buf, cl::Buffer *out_ind_buf) { @@ -258,7 +567,12 @@ public: _out_ind_buf = out_ind_buf; } - // Inherited methods overridden: + /* + * @brief Run CLTopKV2ReorderNegatives op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ void run(const Window &window, cl::CommandQueue &queue) override; private: @@ -268,25 +582,63 @@ private: cl::Buffer *_out_ind_buf; }; +/** + * @brief Class to define CLTopKV2Store + */ class CLTopKV2Store : public ICLKernel { public: - /** Constructor */ + /** + * @brief Constructor + */ CLTopKV2Store(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2Store to be copied + */ CLTopKV2Store(const CLTopKV2Store &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLTopKV2Store to be copied + * @return Reference of this instance + */ CLTopKV2Store &operator=(const CLTopKV2Store &) = delete; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Store to be moved + */ CLTopKV2Store(CLTopKV2Store &&) = default; - /** Allow instances of this class to be moved */ + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Store to be moved + * @return Reference of this instance + */ CLTopKV2Store &operator=(CLTopKV2Store &&) = default; + /** + * @brief Initialise kernel with params + * @param[out] values Values tensor to store + * @param[out] indices Indices tensor to be used for store + * @param[in] k K of the top k predictions + * @param[in] n Number times to store + * return N/A + */ void configure(ICLTensor *values, ICLTensor *indices, int k, int n); + /** + * @brief Set buffers + * @param[out] out_key_buf Buffer of output key + * @param[out] out_ind_buf Buffer of output index + * return N/A + */ void setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf); - // Inherited methods overridden: + /* + * @brief Run CLTopKV2Store op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ void run(const Window &window, cl::CommandQueue &queue) override; private: diff --git a/libs/ARMComputeEx/arm_compute/core/NEON/kernels/NENormalizationLayerExKernel.h b/libs/ARMComputeEx/arm_compute/core/NEON/kernels/NENormalizationLayerExKernel.h new file mode 100644 index 000000000..f7bf72985 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/NEON/kernels/NENormalizationLayerExKernel.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_NENORMALIZATIONLAYEREXKERNEL_H__ +#define __ARM_COMPUTE_NENORMALIZATIONLAYEREXKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the normalization layer kernel. + */ +class NENormalizationLayerExKernel : public INEKernel +{ +public: + const char *name() const override { return "NENormalizationLayerKernel"; } + /** Default constructor */ + NENormalizationLayerExKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NENormalizationLayerExKernel(const NENormalizationLayerExKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NENormalizationLayerExKernel &operator=(const NENormalizationLayerExKernel &) = delete; + /** Default Move Constructor. 
*/ + NENormalizationLayerExKernel(NENormalizationLayerExKernel &&) = default; + /** Default move assignment operator */ + NENormalizationLayerExKernel &operator=(NENormalizationLayerExKernel &&) = default; + /** Default destructor */ + ~NENormalizationLayerExKernel() = default; + /** Set the input and output tensors. + * + * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions + * [width, height, IFM], + * and an optional 4th dimension for batch of inputs. Data types + * supported: FP16/F32. + * @param[in] input_squared Source with each element has been squared. 3 lower dims represent a + * single input with dimensions [width, height, IFM], + * Data type supported: same as @p input + * @param[out] output Destination tensor. Output will have the same number of dimensions as + * input. Data type supported: same as @p input + * @param[in] norm_info Normalization layer information like the normalization type, + * normalization size and other parameters. + */ + void configure(const ITensor *input, const ITensor *input_squared, ITensor *output, + NormalizationLayerInfo norm_info); + /** Static function to check if given info will lead to a valid configuration of @ref + * NENormalizationLayerKernel + * + * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions + * [width, height, IFM], + * and an optional 4th dimension for batch of inputs. Data types + * supported: FP16/F32. + * @param[in] input_squared Source with each element has been squared. 3 lower dims represent a + * single input with dimensions [width, height, IFM], + * Data type supported: same as @p input + * @param[in] output Destination tensor. Output will have the same number of dimensions as + * input. Data type supported: same as @p input + * @param[in] norm_info Normalization layer information like the normalization type, + * normalization size and other parameters. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *input_squared, + const ITensorInfo *output, NormalizationLayerInfo norm_info); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + BorderSize border_size() const override; + +private: + /** Function to perform normalization depending on the given template + * dimension. The second template parameter specifies whether the + * normalization has to be 1D or 2D. + * + * @note Only supported normalizations are: + * - 1D over X or Z + * - 2D over X and Y + * + * @param[in] window Region on which to execute the kernel. + */ + template <DataType dt, unsigned int dim, bool do_2D_norm> + void normalize_float(const Window &window); + + /** Common signature for all the specialised normalization functions + * + * @param[in] window Region on which to execute the kernel. + */ + using NormalizationFunctionEx = void (NENormalizationLayerExKernel::*)(const Window &window); + +private: + NormalizationFunctionEx _func; + const ITensor *_input; + const ITensor *_input_squared; + ITensor *_output; + NormalizationLayerInfo _norm_info; + BorderSize _border_size; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NENORMALIZATIONLAYEREXKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/TypesEx.h b/libs/ARMComputeEx/arm_compute/core/TypesEx.h new file mode 100644 index 000000000..8381f1cc6 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/TypesEx.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_TYPESEX_H__ +#define __ARM_COMPUTE_TYPESEX_H__ + +#include <cmath> +#include <cstddef> +#include <cstdint> +#include <string> +#include <utility> + +namespace arm_compute +{ + +/** Available ArgIndex operations **/ +enum class ArgOperation +{ + MAX, + MIN, +}; + +/** Available reduce operations */ +enum class ReduceOperation +{ + MAX, /**< Max */ + MEAN, /**< Mean */ + SUM, /**< Sum */ + MIN, /**< Min */ +}; + +/** Available binary logical operations */ +enum class BinaryLogicalOperation +{ + AND, /**< AND */ + OR, /**< OR */ +}; + +enum class ComparisonOperation +{ + EQUAL, /**< EQUAL */ + NOT_EQUAL, /**< NOT_EQUAL */ +}; + +/** Activation Layer Information class */ +class ActivationLayerInfoEx +{ +public: + /** Available activation functions */ + enum class ActivationFunction + { + RSQRT /**< Inverse Square root ( \f$ f(x) = \rsqrt{x} \f$ )*/ + }; + + ActivationLayerInfoEx() = default; + /** Default Constructor + * + * @param[in] f The activation function to use. + * @param[in] a (Optional) The alpha parameter used by some activation functions + * (@ref ActivationFunction::BOUNDED_RELU, @ref ActivationFunction::LU_BOUNDED_RELU, + * @ref ActivationFunction::LINEAR, @ref ActivationFunction::TANH). + * @param[in] b (Optional) The beta parameter used by some activation functions (@ref + * ActivationFunction::LINEAR, @ref ActivationFunction::LU_BOUNDED_RELU, @ref + * ActivationFunction::TANH). + */ + ActivationLayerInfoEx(ActivationFunction f, float a = 0.0f, float b = 0.0f) + : _act(f), _a(a), _b(b), _enabled(true) + { + } + /** Get the type of activation function */ + ActivationFunction activation() const { return _act; } + /** Get the alpha value */ + float a() const { return _a; } + /** Get the beta value */ + float b() const { return _b; } + /** Check if initialised */ + bool enabled() const { return _enabled; } + +private: + ActivationFunction _act = {ActivationLayerInfoEx::ActivationFunction::RSQRT}; + float _a = {}; + float _b = {}; + bool _enabled = {false}; +}; + +} // namespace arm_compute +#endif /* __ARM_COMPUTE_TYPESEX_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/UtilsEx.h b/libs/ARMComputeEx/arm_compute/core/UtilsEx.h new file mode 100644 index 000000000..8dd68a0c3 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/UtilsEx.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_UTILSEX_H__ +#define __ARM_COMPUTE_UTILSEX_H__ + +#include "arm_compute/core/TypesEx.h" + +#include <cstdint> +#include <cstdlib> +#include <sstream> +#include <string> + +namespace arm_compute +{ +/** Translates a given activation function to a string. + * + * @param[in] act @ref ActivationLayerInfo::ActivationFunction to be translated to string. + * + * @return The string describing the activation function. + */ +const std::string &string_from_activation_func_ex(ActivationLayerInfoEx::ActivationFunction act); +} +#endif /*__ARM_COMPUTE_UTILSEX_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLActivationLayerEx.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLActivationLayerEx.h new file mode 100644 index 000000000..7e578550f --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLActivationLayerEx.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLACTIVATIONLAYEREX_H__ +#define __ARM_COMPUTE_CLACTIVATIONLAYEREX_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLActivationLayerExKernel + * + * @note The function simulates an activation layer with the specified activation function. + */ +class CLActivationLayerEx : public ICLSimpleFunction +{ +public: + /** Set the input and output tensor. + * + * @note If the output tensor is a nullptr or is equal to the input, the activation function will + * be performed in-place + * + * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will + * store the result + * of the activation function. Data types supported: + * QASYMM8/F16/F32. + * @param[out] output Destination tensor. Data type supported: same as @p input + * @param[in] act_info Activation layer parameters. + */ + void configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfoEx act_info); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLActivationLayer + * + * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor + * will store the result + * of the activation function. Data types supported: QASYMM8/F16/F32. + * @param[in] output Destination tensor info. Data type supported: same as @p input + * @param[in] act_info Activation layer information. 
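+ *
+ * A minimal usage sketch, assuming pre-allocated CLTensor objects with
+ * illustrative names (the tensors are assumptions, not part of this API):
+ * @code
+ * ActivationLayerInfoEx act_info(ActivationLayerInfoEx::ActivationFunction::RSQRT);
+ * ARM_COMPUTE_ERROR_THROW_ON(CLActivationLayerEx::validate(input.info(), output.info(), act_info));
+ * CLActivationLayerEx rsqrt_fn;
+ * rsqrt_fn.configure(&input, &output, act_info);
+ * rsqrt_fn.run();
+ * @endcode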
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ActivationLayerInfoEx &act_info); +}; +} +#endif /* __ARM_COMPUTE_CLACTIVATIONLAYEREX_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMax.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMax.h new file mode 100644 index 000000000..8044c58af --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMax.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CLArgMinMax.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLArgMinMax class + */ + +#ifndef __ARM_COMPUTE_CLARG_MIN_MAX_H__ +#define __ARM_COMPUTE_CLARG_MIN_MAX_H__ + +#include "arm_compute/core/CL/kernels/CLArgMinMaxKernel.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to execute CLArgMinMax operation + */ +class CLArgMinMax : public IFunction +{ +public: + /** + * @brief Construct a new CLArgMinMax object + */ + CLArgMinMax(); + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + */ + CLArgMinMax(const CLArgMinMax &) = delete; + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + */ + CLArgMinMax &operator=(const CLArgMinMax &) = delete; + + /** + * @brief Construct a new CLArgMinMax object by using move constructor + * @param[in] movedInstance CLArgMinMax object to move + */ + CLArgMinMax(CLArgMinMax &&) = default; + + /** + * @brief Assign a CLArgMinMax object. + * @param[in] movedInstance CLArgMinMax object to assign. This object will be moved. + */ + CLArgMinMax &operator=(CLArgMinMax &&) = default; + + /** + * @brief Initialise the kernel's inputs and outputs. + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S32/F32. + * @param[out] output The result of the ArgMin/ArgMax operation. Data types supported: same as @p + * input. + * @param[in] argminmax_axis Axes along which to perform the reduction. They must be sorted and + * contain no duplicates. + * @param[in] op Operation to perform: ArgOperation::MIN for ArgMin or ArgOperation::MAX for ArgMax. + * @return N/A + */ + void configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> argminmax_axis, + ArgOperation op); + + /** + * @brief Static function to check if given info will lead to a valid configuration + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S32/F32. + * @param[in] argminmax_axis Axes along which to perform the reduction + * @param[out] output The result of the ArgMin/ArgMax operation. Data types supported: same as @p + * input. + * @param[in] op Operation to perform: ArgOperation::MIN or ArgOperation::MAX. 
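+ *
+ * A minimal usage sketch, assuming pre-allocated CLTensor objects with
+ * illustrative names and an assumed reduction axis:
+ * @code
+ * CLArgMinMax argmax_fn;
+ * argmax_fn.configure(&input, &output, std::vector<uint32_t>{3}, ArgOperation::MAX);
+ * argmax_fn.run();
+ * @endcode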
+ * @return a status + */ + static Status validate(const ITensorInfo *input, const std::vector<uint32_t> &argminmax_axis, + const ITensorInfo *output, ArgOperation op); + + /** + * @brief Run the kernels contained in the function. + * This operation runs on the CPU or the GPU depending on the value of the + * argminmax_MAX_RUN_ON_CPU macro in CLArgMinMax.cpp. + * If argminmax_MAX_RUN_ON_CPU == 1, the CPU runs this operation; otherwise the GPU runs it. + * @return N/A + */ + void run() override; + +private: + ICLTensor *_input; + ICLTensor *_output; + std::vector<uint32_t> _argminmax_axis; + ArgOperation _arg_op; + + std::unique_ptr<CLTensor[]> _interm_tensors{nullptr}; + std::unique_ptr<CLArgMinMaxKernel[]> _argminmax_kernels{nullptr}; + size_t _num_of_kernels; +}; +} +#endif /*__ARM_COMPUTE_CLARG_MIN_MAX_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArithmeticSubtractionEx.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArithmeticSubtractionEx.h new file mode 100644 index 000000000..34e6c6334 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArithmeticSubtractionEx.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLARITHMETICSUBTRACTIONEX_H__ +#define __ARM_COMPUTE_CLARITHMETICSUBTRACTIONEX_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLArithmeticSubtractionExKernel + * + * @note The tensor data type for the inputs must be U8/S16/F16/F32. + * @note The function performs an arithmetic subtraction between two tensors. + */ +class CLArithmeticSubtractionEx : public ICLSimpleFunction +{ +public: + /** Initialise the kernel's inputs, output and conversion policy. + * + * @param[in, out] input1 An input tensor. Data types supported: U8/S16/F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified + * inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] input2 An input tensor. Data types supported: same as @p input1. + * The input tensor is [in, out] because its TensorInfo might be modified + * inside the kernel in case of broadcasting of dimension 0. + * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), + * S16/F16/F32. + * @param[in] policy Policy to use to handle overflow. + */ + void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLArithmeticSubtractionEx + * + * @param[in] input1 First tensor input info. Data types supported: U8/S16/F16/F32. + * @param[in] input2 Second tensor input info. Data types supported: U8/S16/F16/F32. + * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), + * S16/F16/F32. 
+ * @param[in] policy Policy to use to handle overflow. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output, ConvertPolicy policy); +}; +} +#endif /* __ARM_COMPUTE_CLARITHMETICSUBTRACTIONEX_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h new file mode 100644 index 000000000..d16a0762d --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ +#define __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLBatchToSpaceNDKernel + * + * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. + * @note The function converts the input tensor to the tensor of the output tensor's type. + */ +class CLBatchToSpaceND : public ICLSimpleFunction +{ +public: + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[in] block_size A pointer to an array of integer values specifying block sizes + * for spatial dimension. + */ + void configure(ICLTensor *input, ICLTensor *output, const int32_t *block_size); +}; + +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h new file mode 100644 index 000000000..061e34f26 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __ARM_COMPUTE_CLBINARYLOGICALOP_H__ +#define __ARM_COMPUTE_CLBINARYLOGICALOP_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ICLTensor; + +class CLBinaryLogicalOp : public ICLSimpleFunction +{ +public: + /** Initialise the function's source and destination. + * + * @param[in] input1 Source tensor1. Data types supported: U8, QASYMM8. + * @param[in] input2 Source tensor2. Data types supported: U8 QASYMM8. + * @param[out] output Output tensor. Data types supported: U8, QASYMM8. + */ + void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, + BinaryLogicalOperation op); +}; + +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLBINARYLOGICALOP_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h index 63050067d..56b8408e2 100644 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h @@ -14,30 +14,35 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + +/** + * @file CLCast.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLCast class + */ + #ifndef __ARM_COMPUTE_CLCAST_H__ #define __ARM_COMPUTE_CLCAST_H__ -#include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/ICLSimpleFunction.h" namespace arm_compute { class ICLTensor; -/** Basic function to run @ref CLCastKernel - * - * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. - * @note The function converts the input tensor to the tensor of the output tensor's type. +/** + * @brief Class to run @ref CLCastKernel. + * This converts the input tensor to the tensor of the output tensor's type. */ class CLCast : public ICLSimpleFunction { public: - /** Initialise the kernel's input and output. - * - * @param[in, out] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * The input tensor is [in, out] because its TensorInfo might be modified - * inside the kernel. - * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + /** + * @brief Initialise the kernel's input and output + * @param[in, out] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * The input tensor is [in, out] because its TensorInfo might be + * modified inside the kernel. + * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. */ void configure(ICLTensor *input, ICLTensor *output); }; diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLComparisonOp.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLComparisonOp.h new file mode 100644 index 000000000..1b0d70e7f --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLComparisonOp.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLCOMPARISON_OP_H__ +#define __ARM_COMPUTE_CLCOMPARISON_OP_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ICLTensor; + +class CLComparisonOp : public ICLSimpleFunction +{ +public: + /** Initialise the function's source and destination. + * + * @param[in] input1 Source tensor1. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] input2 Source tensor2. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[out] output Output tensor. Data types supported: Same as @p input. + */ + void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, + const ComparisonOperation &op); +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLCOMPARISON_OP_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h new file mode 100644 index 000000000..d78a6ada4 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLDEPTHTOSPACE_H__ +#define __ARM_COMPUTE_CLDEPTHTOSPACE_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLDepthToSpaceKernel + * + * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. + * @note The function converts the input tensor to the tensor of the output tensor's type. + */ +class CLDepthToSpace : public ICLSimpleFunction +{ +public: + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[block_size] block size integer only + */ + void configure(ICLTensor *input, ICLTensor *output, const int32_t block_size); +}; +} // namesace arm_compute + +#endif /* __ARM_COMPUTE_CLDEPTHTOSPACE_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h new file mode 100644 index 000000000..257772a89 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CLEmbeddingLookup.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLEmbeddingLookup class + */ + +#ifndef __ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__ +#define __ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +#include <vector> + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to perform EmbeddingLookup operation + */ +class CLEmbeddingLookup : public ICLSimpleFunction +{ +public: + /** + * @brief Set the input and output tensors. + * @param[in] input Source tensor. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p + * input. + * @param[in] lookups Lookups 1D tensor that values are indices into the first dimension of + * input. + * @return N/A + */ + void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups); +}; +} +#endif /*__ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLExp.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLExp.h new file mode 100644 index 000000000..2d0fc23a4 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLExp.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLEXP_H__ +#define __ARM_COMPUTE_CLEXP_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLExpKernel */ +class CLExp : public ICLSimpleFunction +{ +public: + /** Set the source, destination of the kernel + * + * @param[in] input Source tensor. Data type supported: F32. + * @param[out] output Destination tensor. Data type supported: F32. + */ + void configure(const ICLTensor *input, ICLTensor *output); +}; +} +#endif /* __ARM_COMPUTE_CLEXP_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h index 3ae7afe14..f7fd3cda1 100644 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h @@ -14,32 +14,43 @@ * See the License for the specific language governing permissions and * limitations under the License. 
 */ + +/** + * @file CLGather.h + * @brief This file contains CLGather class + * @ingroup COM_AI_RUNTIME + */ + #ifndef __ARM_COMPUTE_CLGATHER_H__ #define __ARM_COMPUTE_CLGATHER_H__ -#include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/ICLSimpleFunction.h" namespace arm_compute { class ICLTensor; -/** Basic function to run @ref CLGatherKernel. */ +/** + * @brief Class to run @ref CLGatherKernel. + */ class CLGather : public ICLSimpleFunction { public: - /** Initialise the kernel's inputs, output and convertion policy. - * - * @param[in] input1 An input tensor. Data types supported: U8/S32/F32. - * @param[in] input2 An indexes tensor. Data types supported: S32. - * @param[out] output The output tensor, Data types supported: same as @p input1. - */ + /** + * @brief Initialise the kernel's inputs, output and conversion policy. + * @param[in] input1 An input tensor. Data types supported: U8/S32/F32. + * @param[in] input2 An indexes tensor. Data types supported: S32. + * @param[out] output The output tensor, Data types supported: same as @p input1. + * @return N/A + */ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output); - /** Static function to check if given info will lead to a valid configuration of @ref CLGather - * - * @param[in] input1 An input tensor. Data types supported: U8/S32/F32. - * @param[in] input2 An indexes tensor. Data types supported: S32. - * @param[out] output The output tensor, Data types supported: same as @p input1. + + /** + * @brief Static function to check if given info will lead to a valid configuration + * of @ref CLGather + * @param[in] input1 An input tensor. Data types supported: U8/S32/F32. + * @param[in] input2 An indexes tensor. Data types supported: S32. + * @param[out] output The output tensor, Data types supported: same as @p input1. * @return a status */ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h new file mode 100644 index 000000000..65aa6cbd5 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CLHashtableLookup.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLHashtableLookup class + */ + +#ifndef __ARM_COMPUTE_CLHASHTABLELOOKUP_H__ +#define __ARM_COMPUTE_CLHASHTABLELOOKUP_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +#include <vector> + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to perform HashtableLookup operation + */ +class CLHashtableLookup : public ICLSimpleFunction +{ +public: + /** + * @brief Set the input and output tensors. 
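+ * @note A minimal call sketch, with illustrative tensor names that are
+ * assumed to be pre-allocated CLTensor objects:
+ * @code
+ * CLHashtableLookup lookup_fn;
+ * lookup_fn.configure(&lookups, &keys, &input, &output, &hits);
+ * lookup_fn.run();
+ * @endcode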
+ * @param[in] lookups Lookups 1D tensor that values are indices into the first dimension of + * input. + * @param[in] keys Keys 1D tensor. keys and input pair represent a map. + * Data types supported: S32 + * @param[in] input Source tensor. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p + * input. + * @param[out] hits Hits 1D tensor. A boolean tensor that indicates whether the lookup hits + * (True) or not (False). Data types supported: U8/QASYMM8 + * @return N/A + */ + void configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *intput, + ICLTensor *output, ICLTensor *hits); +}; +} +#endif /*__ARM_COMPUTE_CLHASHTABLELOOKUP_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h new file mode 100644 index 000000000..198a0fd4e --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLNEG_H__ +#define __ARM_COMPUTE_CLNEG_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +class CLNeg : public ICLSimpleFunction +{ +public: + /** Initialise the function's source and destination. + * + * @param[in] input Source tensor. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[out] output Output tensor. Data types supported: Same as @p input. + * + */ + void configure(ICLTensor *input, ICLTensor *output); +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLNEG_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNormalizationLayerEx.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNormalizationLayerEx.h new file mode 100644 index 000000000..4077245d5 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNormalizationLayerEx.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __ARM_COMPUTE_CLNORMALIZATIONLAYEREX_H__ +#define __ARM_COMPUTE_CLNORMALIZATIONLAYEREX_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" +#include "arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to compute a normalization layer. This function calls the following CL kernels: + * + * -# @ref CLFillBorderKernel + * -# @ref CLNormalizationLayerKernelEx + * + */ +class CLNormalizationLayerEx : public IFunction +{ +public: + /** Default constructor */ + CLNormalizationLayerEx(); + /** Set the input and output tensors. + * + * @param[in, out] input Source tensor. 3 lower dims represent a single input with dimensions + * [width, height, IFM], + * and an optional 4th dimension for batch of inputs. Data types + * supported: F16/F32 (Written to by the border handler) + * @param[out] output Destination tensor. Dimensions, data type and number of channels must + * match the input ones. + * @param[in] norm_info Normalization layer information like the normalization type, + * normalization size and other parameters. + */ + void configure(ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLNormalizationLayer + * + * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions + * [width, height, IFM], + * and an optional 4th dimension for batch of inputs. Data types supported: + * F16/F32 + * @param[in] output Destination tensor. Dimensions, data type and number of channels must + * match the input ones. + * @param[in] norm_info Normalization layer information like the normalization type, normalization + * size and other parameters. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const NormalizationLayerInfo &norm_info); + + // Inherited methods overridden: + void run() override; + +private: + CLNormalizationLayerExKernel _norm_kernel; /**< Normalization layer kernel to run */ + CLFillBorderKernel _border_handler; /**< Kernel to handle borders */ +}; +} +#endif /* __ARM_COMPUTE_CLNORMALIZATIONLAYEREX_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h new file mode 100644 index 000000000..622a61b5e --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLPRELU_H__ +#define __ARM_COMPUTE_CLPRELU_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +class CLPReLU : public ICLSimpleFunction +{ +public: + /** Initialise the function's source and destination. 
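+ * @note A minimal call sketch, with illustrative tensor names that are
+ * assumed to be pre-allocated CLTensor objects:
+ * @code
+ * CLPReLU prelu_fn;
+ * prelu_fn.configure(&input, &alpha, &output);
+ * prelu_fn.run();
+ * @endcode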
+ * + * @param[in] input Source tensor. Data types supported: + * QASYMM8/F16/F32. + * @param[in] alpha Alpha (slope) tensor. Data types supported: + * QASYMM8/F16/F32. + * @param[out] output Output tensor. Data types supported: Same as @p input. + */ + void configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output); +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLPRELU_H__*/ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h new file mode 100644 index 000000000..d6ea486d1 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h @@ -0,0 +1,47 @@ +/* +* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +* Copyright (c) 2016-2018 ARM Limited. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ +#ifndef __ARM_COMPUTE_CLPADLAYEREX_H__ +#define __ARM_COMPUTE_CLPADLAYEREX_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLPadLayerKernel + * + * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. + * @note The function pads the input tensor according to the padding sizes given in @p pad_size. + */ +class CLPadLayerEx : public ICLSimpleFunction +{ +public: + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data types supported: + * U8/QASYMM8/S16/S32/F16/F32. + * @param[out] output Output tensor. Data types supported: + * U8/QASYMM8/S16/S32/F16/F32. + * @param[in] pad_size Tensor for Padding values in NHWC format shape [n, 2], + * where n is the rank of the tensor. Data types supported: S32 + */ + void configure(ICLTensor *input, ICLTensor *output, ICLTensor *pad_size); +}; + +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLPADLAYEREX_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPermuteEx.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPermuteEx.h new file mode 100644 index 000000000..9a0cc213c --- --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPermuteEx.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __ARM_COMPUTE_CLPERMUTEEX_H__ +#define __ARM_COMPUTE_CLPERMUTEEX_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to execute an @ref CLPermuteKernel. */ +class CLPermuteEx : public ICLSimpleFunction +{ +public: + /** Set the input and output tensors. + * + * @param[in] input The input tensor to permute. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] output The output tensor. Data types supported: Same as @p input + * @param[in] perm Permutation vector + */ + void configure(const ICLTensor *input, ICLTensor *output, const PermutationVector &perm); + /** Static function to check if given info will lead to a valid configuration of @ref CLPermute. + * + * @param[in] input First tensor input info. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] output Output tensor info. Data types supported: same as @p input. + * @param[in] perm Permutation vector + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const PermutationVector &perm); +}; +} +#endif /*__ARM_COMPUTE_CLPERMUTEEX_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h index c1383e21f..b142d3a2e 100644 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h @@ -14,53 +14,61 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + +/** + * @file CLPixelWiseDivision.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLPixelWiseDivision class + */ #ifndef __ARM_COMPUTE_CLPIXELWISEDIVISION_H__ #define __ARM_COMPUTE_CLPIXELWISEDIVISION_H__ -#include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/ICLSimpleFunction.h" namespace arm_compute { class ICLTensor; -/** Basic function to run @ref CLPixelWiseDivisionKernel. */ +/** + * @brief Class to run @ref CLPixelWiseDivisionKernel. + */ class CLPixelWiseDivision : public ICLSimpleFunction { public: - /** Initialise the kernel's inputs, output and convertion policy. - * - * @param[in, out] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32. + /** + * @brief Initialise the kernel's inputs, output and convertion policy. + * @param[in, out] input1 An input tensor. Data types supported: U8/S16/F16/F32 * The input tensor is [in, out] because its TensorInfo might be * modified inside the kernel in case of broadcasting of dimension 0. * @param[in, out] input2 An input tensor. Data types supported: same as @p input1. * The input tensor is [in, out] because its TensorInfo might be * modified inside the kernel in case of broadcasting of dimension 0. * @param[out] output The output tensor, Data types supported: same as @p input1. - * Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). + * Note: U8 requires both inputs to be U8. * @param[in] scale Scale to apply after multiplication. * Scale must be positive and its value must be either 1/255 or - * 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1. + * 1/2^n where n is between 0 and 15. * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest * even. 
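+ *
+ * A minimal usage sketch, assuming pre-allocated CLTensor objects with
+ * illustrative names; the scale and policies below are the documented defaults:
+ * @code
+ * CLPixelWiseDivision div_fn;
+ * div_fn.configure(&numerator, &denominator, &quotient, 1.f,
+ *                  ConvertPolicy::WRAP, RoundingPolicy::TO_ZERO);
+ * div_fn.run();
+ * @endcode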
+ * @return N/A */ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale = 1.f, ConvertPolicy overflow_policy = ConvertPolicy::WRAP, RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO); - /** Static function to check if given info will lead to a valid configuration of @ref + + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref * CLPixelWiseDivision - * - * @param[in] input1 An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32. + * @param[in] input1 An input tensor info. Data types supported: U8/S16/F16/F32 * @param[in] input2 An input tensor info. Data types supported: same as @p input1. * @param[in] output The output tensor info, Data types supported: same as @p input1. - * Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). + * Note: U8 requires both inputs to be U8. * @param[in] scale Scale to apply after multiplication. * Scale must be positive and its value must be either 1/255 or 1/2^n - * where n is between 0 and 15. For QS8 and QS16 scale must be 1. + * where n is between 0 and 15. * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. - * * @return a status */ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h deleted file mode 100644 index 14b473f33..000000000 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __ARM_COMPUTE_CLREDUCE_MAX_H__ -#define __ARM_COMPUTE_CLREDUCE_MAX_H__ - -#include "arm_compute/runtime/CL/CLArray.h" -#include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Basic function to execute TopK operation. This function calls the following OpenCL kernels: - * - * -# @ref CLTopKV2Kernel - */ -class CLReduceMax : public IFunction -{ -public: - /** Constructor */ - CLReduceMax(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLReduceMax(const CLReduceMax &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLReduceMax &operator=(const CLReduceMax &) = delete; - /** Allow instances of this class to be moved */ - CLReduceMax(CLReduceMax &&) = default; - /** Allow instances of this class to be moved */ - CLReduceMax &operator=(CLReduceMax &&) = default; - /** Initialise the kernel's inputs and outputs. 
- * - * @note When locations of min and max occurrences are requested, the reported number of locations - * is limited to the given array size. - * - * @param[in] input Input image. Data types supported: F32 - * @param[in] axis Axis to reduce. Data type supported: S32 - * @param[out] output indices related to top k values. Data types supported: F32. - */ - void configure(ICLTensor *input, int32_t axis, ICLTensor *output); - /** Static function to check if given info will lead to a valid configuration of @ref - * CLPixelWiseDivision - * - * @param[in] input Input image. Data types supported: F32 - * @param[in] axis Axis to reduce. Data type supported: S32 - * @param[out] output indices related to top k values. Data types supported: F32. * - * - * @return a status - */ - static Status validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output); - - // Inherited methods overridden: - void run() override; - -private: - void run_on_cpu(); - - int32_t _axis; - - ICLTensor *_input; - ICLTensor *_output; - - std::unique_ptr<ICLKernel> _kernel; -}; -} -#endif /*__ARM_COMPUTE_CLREDUCE_MAX_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h new file mode 100644 index 000000000..e1a6f6ab4 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CLReduceOperation.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLReduceOperation class + */ + +#ifndef __ARM_COMPUTE_CLREDUCEOPERATION_H__ +#define __ARM_COMPUTE_CLREDUCEOPERATION_H__ + +#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h" +#include "arm_compute/core/TypesEx.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/IFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to perform ReduceOperation + */ +class CLReduceOperation : public IFunction +{ +public: + /** + * @brief Construct a new ReduceOperation object + */ + CLReduceOperation(); + + /** + * @brief Set the input and output tensors. + * @param[in] input Source tensor. Data types supported: U8/S32/F32 + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p + * input. + * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates. + * @param[in] op Reduce operation to perform. + * @return N/A + */ + void configure(ICLTensor *input, ICLTensor *output, const std::set<uint32_t> &axis, + ReduceOperation op); + + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref + * CLReduceOperation. + * @param[in] input Source tensor info. Data types supported: U8/S32/F32 + * @param[in] output Destination tensor info. 
Data types and data layouts supported: Same as @p + * input. + * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates. + * @param[in] op Reduce operation to perform. + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const std::set<uint32_t> &axis, const ReduceOperation &op); + + /** + * @brief Run the OpenCL kernel for this operation + * @return N/A + */ + void run() override; + +private: + ICLTensor *_input; + ICLTensor *_output; + std::set<uint32_t> _axis; + + std::unique_ptr<CLTensor[]> _interm_tensors{nullptr}; + std::unique_ptr<CLReduceOperationKernel[]> _reduce_kernels{nullptr}; +}; +} +#endif /*__ARM_COMPUTE_CLREDUCEOPERATION_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReductionMean.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReductionMean.h deleted file mode 100644 index 2081518c1..000000000 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReductionMean.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ARM_COMPUTE_CLREDUCTIONMEAN_H__ -#define __ARM_COMPUTE_CLREDUCTIONMEAN_H__ - -#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" -#include "arm_compute/core/CL/kernels/CLReductionMeanKernel.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/runtime/CL/CLTensor.h" -#include "arm_compute/runtime/IFunction.h" - -#include <cstdint> -#include <memory> -#include <vector> - -namespace arm_compute -{ -class ICLTensor; - -/** Perform reduction operation. - */ -class CLReductionMean : public IFunction -{ -public: - /** Default Constructor. - */ - CLReductionMean(); - - /** Set the input and output tensors. - * - * @param[in] input Source tensor. Data types supported: F32. Data layouts supported: NCHW. - * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input. - * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1 - */ - void configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis); - - /** Static function to check if given info will lead to a valid configuration of @ref - * CLReductionMean. - * - * @param[in] input Source tensor info. Data types supported: F32. Data layouts supported: NCHW. - * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p - * input. - * @param[in] axis Axis along which to reduce. 
Supported reduction axis : 0,1 - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - std::vector<uint32_t> axis); - - // Inherited methods overridden: - void run() override; - -private: - CLReductionMeanKernel _reduction_mean_kernel; - CLFillBorderKernel _fill_border_kernel; -}; -} -#endif /*__ARM_COMPUTE_CLREDUCTIONMEAN_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h new file mode 100644 index 000000000..7e2df8986 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_H__ +#define __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLSpaceToBatchNDKernel + * + * @note The tensor data type for the inputs must be U8/QASYMM8/S16/F16/S32/F32. + * @note The function divides "spatial" dimensions of the input into a grid of blocks of shape + * block_shape, and interleaves these blocks with the "batch" dimension such that in the output. + */ +class CLSpaceToBatchND : public ICLSimpleFunction +{ +public: + /** Initialise the kernel's input and output. + * + * @note The data layout of input and output must be the same. + * @note The number of dimensions of input and output must be 4, and `spatial` dimensions + * are height and width. + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32. + * Data layout supported: NCHW/NHWC + * @param[in] block_size Tensor of integer values specifying block sizes for spatial + * dimension. + * Data types supported: S32 + * @param[in] padding_size Tensor of integer values specifying padding sizes for spatial + * dimension. + * Data types supported: S32 + * @param[out] output Output tensor. Data types supported: same as @p input. + * Data layout supported: NCHW/NHWC + */ + void configure(const ICLTensor *input, const ICLTensor *block_size, const ICLTensor *padding_size, + ICLTensor *output); +}; + +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h new file mode 100644 index 000000000..17f762092 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
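For orientation, the workload served by the deleted CLReductionMean maps onto the new CLReduceOperation interface introduced above. A minimal host-side sketch, assuming a ReduceOperation::MEAN enumerator in TypesEx.h and a rank-3 F32 input; the shapes, the enumerator name and the output-shape convention are illustrative and not taken from this patch.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLReduceOperation.h"

#include <cstdint>
#include <set>

using namespace arm_compute;

void reduce_mean_sketch()
{
  CLScheduler::get().default_init(); // create a default CL context and queue

  CLTensor input;
  CLTensor output;
  input.allocator()->init(TensorInfo(TensorShape(16U, 8U, 4U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(16U, 8U, 1U), 1, DataType::F32));

  // Axes must be sorted and free of duplicates, as required by configure().
  CLReduceOperation reduce;
  reduce.configure(&input, &output, std::set<uint32_t>{2U}, ReduceOperation::MEAN);

  input.allocator()->allocate();
  output.allocator()->allocate();

  reduce.run();
  CLScheduler::get().sync();
}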
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLSPACETODEPTH_H__ +#define __ARM_COMPUTE_CLSPACETODEPTH_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLSpaceToDepthKernel + * + * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. + * @note The function rearranges blocks of spatial data into the depth (channel) dimension. + */ +class CLSpaceToDepth : public ICLSimpleFunction +{ +public: + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[in] block_size Block size (positive integer only) + */ + void configure(ICLTensor *input, ICLTensor *output, const int32_t block_size); +}; + +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLSPACETODEPTH_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSquaredDifference.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSquaredDifference.h new file mode 100644 index 000000000..3610ba71c --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSquaredDifference.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLSQUARED_DIFFERENCE_H__ +#define __ARM_COMPUTE_CLSQUARED_DIFFERENCE_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +class CLSquaredDifference : public ICLSimpleFunction +{ +public: + /** Initialise the function's source and destination. + * + * @param[in] input1 Source tensor1. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] input2 Source tensor2. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[out] output Output tensor. Data types supported: Same as @p input. + */ + void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output); +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLSQUARED_DIFFERENCE_H__*/ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSlice.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSlice.h deleted file mode 100644 index f223a79be..000000000 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSlice.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd.
All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __ARM_COMPUTE_CLSTRIDEDSLICE_H__ -#define __ARM_COMPUTE_CLSTRIDEDSLICE_H__ - -#include "arm_compute/runtime/IFunction.h" -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Basic function to run @ref CLStridedSliceKernel */ -class CLStridedSlice : public ICLSimpleFunction -{ -public: - /** Initialise the kernel's inputs and outputs - * - * @param[in] input First tensor input. Data type supported: - * U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32 - * @param[out] output Output tensor. Data type supported: Same as @p input - */ - void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData, - ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask, - int32_t shrinkAxisMask); -}; - -class CLStridedSliceCPU : public IFunction -{ -public: - /** Initialise inputs and outputs - * - * @param[in] input First tensor input. - * @param[out] output Output tensor. - */ - void configure(ICLTensor *input, ICLTensor *output, ICLTensor *beginData, ICLTensor *endData, - ICLTensor *stridesData, int32_t beginMask, int32_t endMask, - int32_t shrinkAxisMask); - - void run() override; - -private: - void run_on_cpu(); - - ICLTensor *_input; - ICLTensor *_output; - ICLTensor *_beginData; - ICLTensor *_endData; - ICLTensor *_stridesData; - int32_t _beginMask; - int32_t _endMask; - int32_t _shrinkAxisMask; -}; -} -#endif /*__ARM_COMPUTE_CLSTRIDEDSLICE_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h new file mode 100644 index 000000000..6b26a85c8 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
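The CLSquaredDifference function added above computes the element-wise squared difference, i.e. output[i] = (input1[i] - input2[i])^2. A minimal configure sketch, assuming CLTensors of matching F32 shape that were initialised and allocated as in the earlier CLReduceOperation sketch.

#include "arm_compute/runtime/CL/functions/CLSquaredDifference.h"

using namespace arm_compute;

// input1, input2 and output are assumed to be already-allocated CL tensors of identical shape.
void squared_difference_sketch(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
{
  CLSquaredDifference sqdiff;
  sqdiff.configure(input1, input2, output); // output[i] = (input1[i] - input2[i])^2
  sqdiff.run();                             // run() is provided by ICLSimpleFunction
}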
+ */ + +/** + * @file CLStridedSlice.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLStridedSlice and arm_compute::CLStridedSliceCPU class + */ + +#ifndef __ARM_COMPUTE_CLSTRIDEDSLICEEX_H__ +#define __ARM_COMPUTE_CLSTRIDEDSLICEEX_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to run @ref CLStridedSliceKernel + */ +class CLStridedSliceEx : public ICLSimpleFunction +{ +public: + /** + * @brief Initialise the kernel's inputs and outputs + * @param[in] input Tensor input. Data type supported: + * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Output tensor. Data type supported: Same as @p input + * @param[in] beginData 'begin' vector of strided slice operation + * @param[in] endData 'end' vector of strided slice operation + * @param[in] stridesData 'strides' vector of strided slice operation + * @param[in] beginMask If the ith bit is set, begin[i] is ignored + * @param[in] endMask If the ith bit is set, end[i] is ignored + * @param[in] shrinkAxisMask If the ith bit is set, the ith specification shrinks the + * dimensionality by 1, taking on the value at index begin[i] + * @return N/A + */ + void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData, + ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask, + int32_t shrinkAxisMask); +}; +} +#endif /*__ARM_COMPUTE_CLSTRIDEDSLICEEX_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h index 06cd1ee9b..5327e016f 100644 --- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h @@ -14,51 +14,79 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + +/** + * @file CLTopKV2.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLTopKV2 class + */ #ifndef __ARM_COMPUTE_CLTOPK_V2_H__ #define __ARM_COMPUTE_CLTOPK_V2_H__ #include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h" -#include "arm_compute/runtime/CL/CLArray.h" #include "arm_compute/runtime/IFunction.h" namespace arm_compute { class ICLTensor; -/** Basic function to execute TopK operation. This function calls the following OpenCL kernels: - * - * -# @ref CLTopKV2Kernel +/** + * @brief Class to execute TopKV2 operation. */ class CLTopKV2 : public IFunction { public: - /** Constructor */ + /** + * @brief Construct a new CLTopKV2 object + */ CLTopKV2(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + */ CLTopKV2(const CLTopKV2 &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + */ CLTopKV2 &operator=(const CLTopKV2 &) = delete; - /** Allow instances of this class to be moved */ + + /** + * @brief Construct a new CLTopKV2 object by using copy constructor + * @param[in] CLTopKV2 object to move + */ CLTopKV2(CLTopKV2 &&) = default; - /** Allow instances of this class to be moved */ + + /** + * @brief Assign a CLTopKV2 object. + * @param[in] CLTopKV2 object to assign. This object will be moved. + */ CLTopKV2 &operator=(CLTopKV2 &&) = default; - /** Initialise the kernel's inputs and outputs. 
- * - * @note When locations of min and max occurrences are requested, the reported number of locations - * is limited to the given array size. - * + + /** + * @brief Initialise the kernel's inputs and outputs. * @param[in] input Input image. Data types supported: U8/S16/F32. * @param[in] k The value of `k`. * @param[out] values Top k values. Data types supported: S32 if input type is U8/S16, F32 if * input type is F32. - * @param[out] indices indices related to top k values. Data types supported: S32 if input type + * @param[out] indices Indices related to top k values. Data types supported: S32 if input type * is U8/S16, F32 if input type is F32. + * @return N/A */ void configure(ICLTensor *input, int k, ICLTensor *values, ICLTensor *indices, int total_bits = 32, int bits = 4); - // Inherited methods overridden: + /** + * @brief Run the kernels contained in the function + * Depending on the value of the following environment variables it works differently: + * - If the value of environment variable "ACL_TOPKV2" == "GPU_SINGLE", + * quick sort on GPU is used. + * - If the value of environment variable "ACL_TOPKV2" == ""GPU"", + * radix sort on GPU is used. + * - For other value, TopKV2 runs on CPU + * @return N/A + */ void run() override; private: diff --git a/libs/ARMComputeEx/arm_compute/runtime/NEON/functions/NENormalizationLayerEx.h b/libs/ARMComputeEx/arm_compute/runtime/NEON/functions/NENormalizationLayerEx.h new file mode 100644 index 000000000..fa7408ecd --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/NEON/functions/NENormalizationLayerEx.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_NENORMALIZATIONLAYEREX_H__ +#define __ARM_COMPUTE_NENORMALIZATIONLAYEREX_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/NEON/kernels/NENormalizationLayerExKernel.h" +#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h" +#include "arm_compute/runtime/MemoryGroup.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to compute a normalization layer. This function calls the following NEON kernels: + * + * -# @ref NEPixelWiseMultiplicationKernel + * -# @ref NEFillBorderKernel + * -# @ref NENormalizationLayerKernelEx + * + */ +class NENormalizationLayerEx : public IFunction +{ +public: + /** Default constructor */ + NENormalizationLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Set the input and output tensors. + * + * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions + * [width, height, IFM], + * and an optional 4th dimension for batch of inputs. 
Data type supported: + * F16/F32 + * @param[out] output Destination with the same dimensions, data type and number of channels of + * @p input + * @param[in] norm_info Normalization layer information like the normalization type, + * normalization size and other parameters. + */ + void configure(const ITensor *input, ITensor *output, const NormalizationLayerInfo &norm_info); + /** Static function to check if given info will lead to a valid configuration of @ref + * NENormalizationLayer + * + * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions + * [width, height, IFM], + * and an optional 4th dimension for batch of inputs. Data type supported: + * F16/F32 + * @param[in] output Destination with the same dimensions, data type and number of channels of + * @p input + * @param[in] norm_info Normalization layer information like the normalization type, normalization + * size and other parameters. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const NormalizationLayerInfo &norm_info); + + // Inherited methods overridden: + void run() override; + +private: + MemoryGroup _memory_group; /**< Function memory group */ + NENormalizationLayerExKernel _norm_kernel; /**< Normalization layer kernel */ + NEPixelWiseMultiplicationKernel _multiply_kernel; /**< Pixel multiplication kernel */ + NEFillBorderKernel _border_handler; /**< Kernel to handle borders */ + Tensor _input_squared; /**< The intermediate buffer which stores results of squaring input */ +}; +} +#endif /* __ARM_COMPUTE_NENORMALIZATIONLAYEREX_H__ */ diff --git a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp index d535c5da4..05ecdeb22 100644 --- a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp +++ b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp @@ -38,255 +38,37 @@ using namespace arm_compute; const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map = { - {"absdiff", "absdiff.cl"}, - {"accumulate", "accumulate.cl"}, - {"accumulate_squared", "accumulate.cl"}, - {"accumulate_weighted", "accumulate.cl"}, - {"activation_layer", "activation_layer.cl"}, - {"activation_layer_qa8", "activation_layer_qa8.cl"}, - {"activation_layer_logistic_qa8", "activation_layer_qa8.cl"}, - {"arithmetic_add", "arithmetic_op.cl"}, - {"arithmetic_sub", "arithmetic_op.cl"}, + // ARMComputeEx kernels + {"activation_layer_ex", "activation_layer_ex.cl"}, + {"arg_op", "arg_operation.cl"}, + {"arithmetic_sub_ex", "arithmetic_op_ex.cl"}, {"arithmetic_add_qasymm8", "arithmetic_op_quantized.cl"}, - {"batchnormalization_layer_nchw", "batchnormalization_layer.cl"}, - {"batchnormalization_layer_nhwc", "batchnormalization_layer.cl"}, - {"bitwise_or", "bitwise_op.cl"}, - {"bitwise_and", "bitwise_op.cl"}, - {"bitwise_xor", "bitwise_op.cl"}, - {"bitwise_not", "bitwise_op.cl"}, + {"batch_to_space_nd", "batch_to_space_nd.cl"}, + {"binary_logical_op", "binary_logical_op.cl"}, {"cast", "cast.cl"}, {"cast_qasymm_in", "cast.cl"}, {"cast_qasymm_out", "cast.cl"}, - {"channel_combine_NV", "channel_combine.cl"}, - {"channel_combine_RGB888", "channel_combine.cl"}, - {"channel_combine_RGBA8888", "channel_combine.cl"}, - {"channel_combine_UYVY422", "channel_combine.cl"}, - {"channel_combine_YUYV422", "channel_combine.cl"}, - {"channel_shuffle_nchw", "channel_shuffle.cl"}, - {"channel_extract_NV12", "channel_extract.cl"}, - {"channel_extract_NV21", "channel_extract.cl"}, - {"channel_extract_RGB888", 
"channel_extract.cl"}, - {"channel_extract_RGBA8888", "channel_extract.cl"}, - {"channel_extract_UYVY422", "channel_extract.cl"}, - {"channel_extract_YUYV422", "channel_extract.cl"}, - {"combine_gradients_L1", "canny.cl"}, - {"combine_gradients_L2", "canny.cl"}, - {"concatenate_depth", "concatenate.cl"}, - {"concatenate_width", "concatenate.cl"}, - {"convolution_rectangle", "convolution_rectangle.cl"}, - {"col2im", "col2im.cl"}, - {"convert_depth_down", "depth_convert.cl"}, - {"convert_depth_up", "depth_convert.cl"}, - {"convert_fc_weights", "convert_fc_weights.cl"}, - {"convolution3x3_static", "convolution3x3.cl"}, - {"convolution5x5_static", "convolution5x5.cl"}, - {"convolution7x7_static", "convolution7x7.cl"}, - {"convolution9x9_static", "convolution9x9.cl"}, - {"convolution_separable1x5_static", "convolution5x5.cl"}, - {"convolution_separable5x1_static", "convolution5x5.cl"}, - {"convolution_separable1x7_static", "convolution7x7.cl"}, - {"convolution_separable7x1_static", "convolution7x7.cl"}, - {"convolution_separable1x9_static", "convolution9x9.cl"}, - {"convolution_separable9x1_static", "convolution9x9.cl"}, - {"copy_tensor", "copy_tensor.cl"}, - {"copy_plane", "channel_extract.cl"}, - {"copy_planes_3p", "channel_combine.cl"}, - {"copy_to_keypoint", "fast_corners.cl"}, - {"deconvolution_upsample", "deconvolution_layer.cl"}, - {"depthwise_convolution_3x3", "depthwise_convolution.cl"}, - {"depthwise_convolution_3x3_f16", "depthwise_convolution.cl"}, - {"depthwise_convolution_3x3_quantized_nchw", "depthwise_convolution_quantized.cl"}, - {"depthwise_convolution_3x3_quantized_nhwc_stride1", "depthwise_convolution_quantized.cl"}, - {"depthwise_convolution_3x3_quantized_nhwc_stride2", "depthwise_convolution_quantized.cl"}, - {"depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16", "depthwise_convolution.cl"}, - {"depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16", "depthwise_convolution.cl"}, - {"depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32", "depthwise_convolution.cl"}, - {"depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32", "depthwise_convolution.cl"}, - {"depthwise_im2col", "depthwise_convolution.cl"}, - {"depthwise_vector_to_tensor", "depthwise_convolution.cl"}, - {"depthwise_weights_reshape", "depthwise_convolution.cl"}, - {"dequantization_layer", "dequantization_layer.cl"}, - {"derivative", "derivative.cl"}, - {"dilate", "dilate.cl"}, - {"direct_convolution1x1", "direct_convolution1x1.cl"}, - {"direct_convolution1x1_f32_bifrost", "direct_convolution1x1.cl"}, - {"direct_convolution3x3", "direct_convolution3x3.cl"}, - {"direct_convolution3x3_f32_bifrost", "direct_convolution3x3.cl"}, - {"direct_convolution5x5", "direct_convolution5x5.cl"}, - {"direct_convolution5x5_f32_bifrost", "direct_convolution5x5.cl"}, - {"direct_convolution_1x1_3x3_5x5_quantized", "direct_convolution_1x1_3x3_5x5_quantized.cl"}, - {"erode", "erode.cl"}, - {"fast_corners", "fast_corners.cl"}, - {"fill_image_borders_constant", "fill_border.cl"}, - {"fill_image_borders_replicate", "fill_border.cl"}, - {"finalize", "optical_flow_pyramid_lk.cl"}, - {"floor_layer", "floor.cl"}, + {"comparison_op", "comparison_op.cl"}, + {"comparison_op_qasymm8", "comparison_op_quantized.cl"}, + {"depth_to_space", "depth_to_space.cl"}, + {"embedding_lookup", "embedding_lookup.cl"}, + {"exp_layer", "exp.cl"}, {"gather", "gather.cl"}, {"gather_1d", "gather.cl"}, {"gather_1d_out", "gather.cl"}, - {"gaussian1x5_sub_x", "gaussian_pyramid.cl"}, - {"gaussian5x1_sub_y", "gaussian_pyramid.cl"}, - 
{"gemm_accumulate_biases", "gemm.cl"}, - {"gemm_interleave4x4", "gemm.cl"}, - {"gemm_ma_f16", "gemm.cl"}, - {"gemm_ma_f32", "gemm.cl"}, - {"gemm_ma_qs8", "gemm.cl"}, - {"gemm_ma_qs16", "gemm.cl"}, - {"gemm_mv", "gemv.cl"}, - {"gemm_mv_quantized", "gemv.cl"}, - {"gemm_mm_interleaved_transposed_f16", "gemm.cl"}, - {"gemm_mm_interleaved_transposed_f16_bifrost", "gemm.cl"}, - {"gemm_mm_interleaved_transposed_f32", "gemm.cl"}, - {"gemm_mm_interleaved_transposed_f32_bifrost", "gemm.cl"}, - {"gemm_mm_interleaved_transposed_qs8", "gemm.cl"}, - {"gemm_mm_interleaved_transposed_qs16", "gemm.cl"}, - {"gemm_mm_floating_point", "gemm.cl"}, - {"gemm_mm_floating_point_f16_bifrost", "gemm.cl"}, - {"gemm_mm_floating_point_f32_bifrost", "gemm.cl"}, - {"gemm_mm_floating_point_f32_bifrost_1000", "gemm.cl"}, - {"gemm_mm_qs8", "gemm.cl"}, - {"gemm_mm_qs16", "gemm.cl"}, - {"gemm_lc_vm_f32", "gemm.cl"}, - {"gemm_transpose1xW", "gemm.cl"}, - {"gemmlowp_matrix_a_reduction", "gemmlowp.cl"}, - {"gemmlowp_matrix_b_reduction", "gemmlowp.cl"}, - {"gemmlowp_mm_bifrost", "gemmlowp.cl"}, - {"gemmlowp_mm_midgard", "gemmlowp.cl"}, - {"gemmlowp_mm_interleaved_transposed_bifrost", "gemmlowp.cl"}, - {"gemmlowp_mm_interleaved_transposed_midgard", "gemmlowp.cl"}, - {"gemmlowp_offset_contribution", "gemmlowp.cl"}, - {"gemmlowp_output_stage_quantize_down", "gemmlowp.cl"}, - {"gemmlowp_output_stage_quantize_down_fixedpoint", "gemmlowp.cl"}, - {"harris_score_3x3", "harris_corners.cl"}, - {"harris_score_5x5", "harris_corners.cl"}, - {"harris_score_7x7", "harris_corners.cl"}, - {"hist_border_kernel", "histogram.cl"}, - {"hist_border_kernel_fixed", "histogram.cl"}, - {"hist_local_kernel", "histogram.cl"}, - {"hist_local_kernel_fixed", "histogram.cl"}, - {"hog_block_normalization", "hog.cl"}, - {"hog_detector", "hog.cl"}, - {"hog_orientation_binning", "hog.cl"}, - {"hysteresis", "canny.cl"}, - {"im2col1x1_stridex1_dchw", "im2col.cl"}, - {"im2col3x3_dchw", "im2col.cl"}, - {"im2col5x5_dchw", "im2col.cl"}, - {"im2col11x11_padx0_pady0_dchw", "im2col.cl"}, - {"im2col_generic_dchw", "im2col.cl"}, - {"im2col_generic_padx0_pady0_dchw", "im2col.cl"}, - {"im2col_reduced_dchw", "im2col.cl"}, - {"init_level", "optical_flow_pyramid_lk.cl"}, - {"init_level_max", "optical_flow_pyramid_lk.cl"}, - {"init_level_max_initial_estimate", "optical_flow_pyramid_lk.cl"}, - {"integral_horizontal", "integral_image.cl"}, - {"integral_vertical", "integral_image.cl"}, - {"IYUV_to_NV12_bt709", "color_convert.cl"}, - {"IYUV_to_RGB888_bt709", "color_convert.cl"}, - {"IYUV_to_RGBA8888_bt709", "color_convert.cl"}, - {"IYUV_to_YUV444_bt709", "color_convert.cl"}, - {"l2_normalize", "l2_normalize.cl"}, - {"lktracker_stage0", "optical_flow_pyramid_lk.cl"}, - {"lktracker_stage1", "optical_flow_pyramid_lk.cl"}, - {"magnitude_phase", "magnitude_phase.cl"}, - {"mean_stddev_accumulate", "mean_stddev.cl"}, - {"minmax", "minmaxloc.cl"}, - {"minmax_border", "minmaxloc.cl"}, - {"minmax_layer", "minmax_layer.cl"}, - {"minmaxloc", "minmaxloc.cl"}, - {"non_linear_filter_box3x3", "non_linear_filter3x3.cl"}, - {"non_linear_filter_cross3x3", "non_linear_filter3x3.cl"}, - {"non_linear_filter_disk3x3", "non_linear_filter3x3.cl"}, - {"non_linear_filter_box5x5", "non_linear_filter5x5.cl"}, - {"non_linear_filter_cross5x5", "non_linear_filter5x5.cl"}, - {"non_linear_filter_disk5x5", "non_linear_filter5x5.cl"}, - {"non_max_suppression", "nonmax.cl"}, - {"normalization_layer_cross_map", "normalization_layer.cl"}, - {"normalization_layer_in_map", "normalization_layer.cl"}, - {"NV12_to_IYUV_bt709", 
"color_convert.cl"}, - {"NV12_to_RGB888_bt709", "color_convert.cl"}, - {"NV12_to_RGBA8888_bt709", "color_convert.cl"}, - {"NV12_to_YUV444_bt709", "color_convert.cl"}, - {"NV21_to_IYUV_bt709", "color_convert.cl"}, - {"NV21_to_RGB888_bt709", "color_convert.cl"}, - {"NV21_to_RGBA8888_bt709", "color_convert.cl"}, - {"NV21_to_YUV444_bt709", "color_convert.cl"}, - {"output_stage_quantized", "direct_convolution_1x1_3x3_5x5_quantized.cl"}, - {"permute_201", "permute.cl"}, - {"permute_120", "permute.cl"}, - {"permute_3201", "permute.cl"}, - {"pixelwise_mul_float", "pixelwise_mul_float.cl"}, - {"pixelwise_mul_int", "pixelwise_mul_int.cl"}, + {"hashtable_lookup", "hashtable_lookup.cl"}, + {"neg_tensor", "neg_tensor.cl"}, + {"pad", "pad.cl"}, + {"permute_generic", "permute_ex.cl"}, {"pixelwise_mul_qasymm8", "pixelwise_mul_quantized.cl"}, {"pixelwise_div_float", "pixelwise_div_float.cl"}, {"pixelwise_div_int", "pixelwise_div_int.cl"}, - {"pooling_layer_2", "pooling_layer.cl"}, - {"pooling_layer_3", "pooling_layer.cl"}, - {"pooling_layer_optimized_3", "pooling_layer.cl"}, - {"pooling_layer_7", "pooling_layer.cl"}, - {"pooling_layer_MxN_nchw", "pooling_layer.cl"}, - {"pooling_layer_MxN_nhwc", "pooling_layer.cl"}, - {"pooling_layer_MxN_quantized_nhwc", "pooling_layer_quantized.cl"}, - {"pooling_layer_MxN_quantized_nchw", "pooling_layer_quantized.cl"}, - {"quantization_layer", "quantization_layer.cl"}, - {"reduce_max", "reduce_max.cl"}, - {"reduction_operation", "reduction_operation.cl"}, - {"reduction_mean", "reduction_mean.cl"}, - {"remap_nearest_neighbour", "remap.cl"}, - {"remap_bilinear", "remap.cl"}, - {"reshape_layer", "reshape_layer.cl"}, - {"reshape_to_columns", "convolution_layer.cl"}, - {"RGB888_to_IYUV_bt709", "color_convert.cl"}, - {"RGB888_to_NV12_bt709", "color_convert.cl"}, - {"RGB888_to_RGBA8888_bt709", "color_convert.cl"}, - {"RGB888_to_YUV444_bt709", "color_convert.cl"}, - {"RGBA8888_to_IYUV_bt709", "color_convert.cl"}, - {"RGBA8888_to_NV12_bt709", "color_convert.cl"}, - {"RGBA8888_to_RGB888_bt709", "color_convert.cl"}, - {"RGBA8888_to_YUV444_bt709", "color_convert.cl"}, - {"roi_pooling_layer", "roi_pooling_layer.cl"}, - {"scale_nearest_neighbour", "scale.cl"}, - {"scale_bilinear", "scale.cl"}, - {"scharr3x3", "scharr_filter.cl"}, - {"sobel3x3", "sobel_filter.cl"}, - {"sobel_separable5x1", "sobel_filter.cl"}, - {"sobel_separable1x5", "sobel_filter.cl"}, - {"sobel_separable7x1", "sobel_filter.cl"}, - {"sobel_separable1x7", "sobel_filter.cl"}, - {"softmax_layer_norm", "softmax_layer.cl"}, - {"softmax_layer_norm_quantized", "softmax_layer_quantized.cl"}, - {"softmax_layer_max_shift_exp_sum_quantized_serial", "softmax_layer_quantized.cl"}, - {"softmax_layer_max_shift_exp_sum_quantized_parallel", "softmax_layer_quantized.cl"}, - {"softmax_layer_max_shift_exp_sum_serial", "softmax_layer.cl"}, - {"softmax_layer_max_shift_exp_sum_parallel", "softmax_layer.cl"}, - {"strided_slice", "strided_slice.cl"}, - {"suppress_non_maximum", "canny.cl"}, - {"tablelookup_U8", "tablelookup.cl"}, - {"tablelookup_S16", "tablelookup.cl"}, - {"threshold_binary", "threshold.cl"}, - {"threshold_range", "threshold.cl"}, - {"transpose", "transpose.cl"}, - {"UYVY422_to_IYUV_bt709", "color_convert.cl"}, - {"UYVY422_to_NV12_bt709", "color_convert.cl"}, - {"UYVY422_to_RGB888_bt709", "color_convert.cl"}, - {"UYVY422_to_RGBA8888_bt709", "color_convert.cl"}, - {"warp_affine_nearest_neighbour", "warp_affine.cl"}, - {"warp_affine_bilinear", "warp_affine.cl"}, - {"warp_perspective_nearest_neighbour", "warp_perspective.cl"}, - 
{"warp_perspective_bilinear", "warp_perspective.cl"}, - {"winograd_filter_transform_2x2_3x3_nchw", "winograd.cl"}, - {"winograd_filter_transform_4x4_3x3_nchw", "winograd.cl"}, - {"winograd_filter_transform_4x4_5x5_nchw", "winograd.cl"}, - {"winograd_input_transform_4x4_5x5_stepz1_nchw", "winograd.cl"}, - {"winograd_input_transform_2x2_3x3_stepz1_nchw", "winograd.cl"}, - {"winograd_input_transform_2x2_3x3_stepz2_nchw", "winograd.cl"}, - {"winograd_input_transform_4x4_3x3_stepz1_nchw", "winograd.cl"}, - {"winograd_output_transform_2x2_3x3_nchw", "winograd.cl"}, - {"winograd_output_transform_4x4_3x3_nchw", "winograd.cl"}, - {"winograd_output_transform_4x4_5x5_nchw", "winograd.cl"}, - {"YUYV422_to_IYUV_bt709", "color_convert.cl"}, - {"YUYV422_to_NV12_bt709", "color_convert.cl"}, - {"YUYV422_to_RGB888_bt709", "color_convert.cl"}, - {"YUYV422_to_RGBA8888_bt709", "color_convert.cl"}, + {"prelu", "prelu.cl"}, + {"prelu_qasymm8", "prelu_quantized.cl"}, + {"reduce_min_max", "reduce_operation.cl"}, + {"reduce_sum_mean", "reduce_operation.cl"}, + {"squared_difference", "squared_difference.cl"}, + {"strided_slice_ex", "strided_slice_ex.cl"}, {"topkv2_init", "topkv2.cl"}, {"topkv2_find_first_negative", "topkv2.cl"}, {"topkv2_reorder_negatives", "topkv2.cl"}, @@ -296,23 +78,62 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map {"radixsort_pastehistograms", "topkv2_radixsort.cl"}, {"radixsort_reorder", "topkv2_radixsort.cl"}, {"topkv2_quicksort", "topkv2_quicksort.cl"}, + {"space_to_batch_4d_nchw", "space_to_batch.cl"}, + {"space_to_batch_4d_nhwc", "space_to_batch.cl"}, + {"space_to_depth", "space_to_depth.cl"}, }; const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map = { #ifdef EMBEDDED_KERNELS { + "activation_layer_ex.cl", +#include "./cl_kernels/activation_layer_ex.clembed" + }, + { + "arg_operation.cl", +#include "./cl_kernels/arg_operation.clembed" + }, + { + "arithmetic_op_ex.cl", +#include "./cl_kernels/arithmetic_op_ex.clembed" + }, + { + "batch_to_space_nd.cl", +#include "./cl_kernels/batch_to_space_nd.clembed" + }, + { "cast.cl", #include "./cl_kernels/cast.clembed" }, { - "fixed_point.h", -#include "./cl_kernels/fixed_point.hembed" + "comparison_op.cl", +#include "./cl_kernels/comparison_op.clembed" + }, + { + "comparison_op_quantized.cl", +#include "./cl_kernels/comparison_op_quantized.clembed" + }, + { + "embedding_lookup.cl", +#include "./cl_kernels/embedding_lookup.clembed" + }, + { + "depth_to_space.cl", +#include "./cl_kernels/depth_to_space.clembed" + }, + { + "exp.cl", +#include "./cl_kernels/exp.clembed" }, { "gather.cl", #include "./cl_kernels/gather.clembed" }, { + "hashtable_lookup.cl", +#include "./cl_kernels/hashtable_lookup.clembed" + }, + { "helpers.h", #include "./cl_kernels/helpers.hembed" }, @@ -321,6 +142,18 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map #include "./cl_kernels/helpers_asymm.hembed" }, { + "binary_logical_op.cl", +#include "./cl_kernels/binary_logical_op.clembed" + }, + { + "neg_tensor.cl", +#include "./cl_kernels/neg_tensor.clembed" + }, + { + "pad.cl", +#include "./cl_kernels/pad.clembed" + }, + { "pixelwise_div_float.cl", #include "./cl_kernels/pixelwise_div_float.clembed" }, @@ -329,16 +162,32 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map #include "./cl_kernels/pixelwise_div_int.clembed" }, { - "reduce_max.cl", -#include "./cl_kernels/reduce_max.clembed" + "prelu.cl", +#include "./cl_kernels/prelu.clembed" + }, + { + 
"prelu_quantized.cl", +#include "./cl_kernels/prelu_quantized.clembed" + }, + { + "reduce_operation.cl", +#include "./cl_kernels/reduce_operation.clembed" + }, + { + "space_to_batch.cl", +#include "./cl_kernels/space_to_batch.clembed" }, { - "reduction_mean.cl", -#include "./cl_kernels/reduction_mean.clembed" + "space_to_depth.cl", +#include "./cl_kernels/space_to_depth.clembed" }, { - "strided_slice.cl", -#include "./cl_kernels/strided_slice.clembed" + "squared_difference.cl", +#include "./cl_kernels/squared_difference.clembed" + }, + { + "strided_slice_ex.cl", +#include "./cl_kernels/strided_slice_ex.clembed" }, { "topkv2.cl", @@ -352,6 +201,11 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map "topkv2_quicksort.cl", #include "./cl_kernels/topkv2_quicksort.clembed" }, + { + "permute_ex.cl", +#include "./cl_kernels/permute_ex.clembed" + }, + #endif /* EMBEDDED_KERNELS */ }; @@ -359,7 +213,7 @@ CLKernelLibraryEx::CLKernelLibraryEx() : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map() { opencl_is_available(); // Make sure the OpenCL symbols are initialised *before* the - // CLKernelLibrary is built + // CLKernelLibraryEx is built } CLKernelLibraryEx &CLKernelLibraryEx::get() @@ -380,7 +234,7 @@ Kernel CLKernelLibraryEx::create_kernel(const std::string &kernel_name, } std::string concat_str; - if (fp16_supported(_device)) + if (fp16_supported()) { concat_str += " -DARM_COMPUTE_OPENCL_FP16_ENABLED=1 "; } @@ -434,6 +288,13 @@ void CLKernelLibraryEx::add_built_program(const std::string &built_program_name, _built_programs_map.emplace(built_program_name, program); } +bool CLKernelLibraryEx::fp16_supported() const { return ::fp16_supported(_device); } + +bool CLKernelLibraryEx::int64_base_atomics_supported() const +{ + return device_supports_extension(_device, "cl_khr_int64_base_atomics"); +} + const Program &CLKernelLibraryEx::load_program(const std::string &program_name) const { const auto program_it = _programs_map.find(program_name); @@ -525,6 +386,7 @@ size_t CLKernelLibraryEx::max_local_workgroup_size(const cl::Kernel &kernel) con cl::NDRange CLKernelLibraryEx::default_ndrange() const { + // GPUTarget _target = get_target_from_device(_device); cl::Device device = cl::Device::getDefault(); GPUTarget _target = get_target_from_device(device); cl::NDRange default_range; diff --git a/libs/ARMComputeEx/src/core/CL/OpenCLEx.cpp b/libs/ARMComputeEx/src/core/CL/OpenCLEx.cpp new file mode 100644 index 000000000..cbda169fb --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/OpenCLEx.cpp @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/CL/OpenCLEx.h" + +#include <dlfcn.h> +#include <iostream> + +namespace arm_compute +{ +CLSymbolsEx &CLSymbolsEx::get() +{ + static CLSymbolsEx symbols; + return symbols; +} + +bool CLSymbolsEx::load_default() +{ + static const std::vector<std::string> libraries{"libOpenCL.so", "libGLES_mali.so", "libmali.so"}; + + if (_loaded.first) + { + return _loaded.second; + } + + // Indicate that default loading has been tried + _loaded.first = true; + + for (const auto &lib : libraries) + { + if (load(lib)) + { + return true; + } + } + + std::cerr << "Couldn't find any OpenCL library.\n"; + return false; +} + +bool CLSymbolsEx::load(const std::string &library) +{ + void *handle = dlopen(library.c_str(), RTLD_LAZY | RTLD_LOCAL); + + if (handle == nullptr) + { + std::cerr << "Can't load " << library << ": " << dlerror() << "\n"; + // Set status of loading to failed + _loaded.second = false; + return false; + } + +#define LOAD_FUNCTION_PTR(func_name, handle) \ + func_name##_ptr = reinterpret_cast<decltype(func_name) *>(dlsym(handle, #func_name)); + + LOAD_FUNCTION_PTR(clGetEventInfo, handle); + LOAD_FUNCTION_PTR(clSetEventCallback, handle); + +#undef LOAD_FUNCTION_PTR + + // Don't call dlclose(handle) or all the symbols will be unloaded ! + + // Disable default loading and set status to successful + _loaded = std::make_pair(true, true); + + return true; +} + +} // namespace arm_compute + +cl_int clGetEventInfo(cl_event event, cl_event_info param_name, size_t param_value_size, + void *param_value, size_t *param_value_size_ret) +{ + arm_compute::CLSymbolsEx::get().load_default(); + auto func = arm_compute::CLSymbolsEx::get().clGetEventInfo_ptr; + if (func != nullptr) + { + return func(event, param_name, param_value_size, param_value, param_value_size_ret); + } + else + { + return CL_OUT_OF_RESOURCES; + } +} + +cl_int clSetEventCallback(cl_event event, cl_int command_exec_callback_type, + void(CL_CALLBACK *pfn_ev_notify)(cl_event ev, cl_int ev_cmd_exec_status, + void *user_data), + void *user_data) +{ + arm_compute::CLSymbolsEx::get().load_default(); + auto func = arm_compute::CLSymbolsEx::get().clSetEventCallback_ptr; + if (func != nullptr) + { + return func(event, command_exec_callback_type, pfn_ev_notify, user_data); + } + else + { + return CL_OUT_OF_RESOURCES; + } +} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl new file mode 100644 index 000000000..f54c7bde3 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
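The wrappers defined in OpenCLEx.cpp above resolve clGetEventInfo and clSetEventCallback lazily via dlopen/dlsym and return CL_OUT_OF_RESOURCES when the symbol cannot be found. A caller-side sketch of handling that failure mode; the event handle and the error reporting are illustrative.

#include <CL/cl.h>

#include <iostream>

// Query the execution status of an event through the lazily-loaded wrapper.
// If no OpenCL library could be resolved, the wrapper returns CL_OUT_OF_RESOURCES.
void print_event_status(cl_event event)
{
  cl_int status = CL_QUEUED;
  const cl_int err = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
                                    sizeof(status), &status, nullptr);
  if (err != CL_SUCCESS)
  {
    std::cerr << "clGetEventInfo failed or unavailable: " << err << std::endl;
    return;
  }
  std::cout << "event execution status: " << status << std::endl;
}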
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + +#define CONST_ONE 1.f +#define DIV_OP(a, b) ((a) / (b)) +#define RSQRT_OP(a) DIV_OP(CONST_ONE, sqrt((a))) + +// Inverse Square-root Activation +inline TYPE rsqrt_op(TYPE x) +{ + return RSQRT_OP(x); +} + +#define ACTIVATION_OP2(op, x) op##_op(x) +#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x) + +#if defined(ACT) + +/** This performs an activation function floating point inputs. + * + * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH + * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively. + * + * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] output_ptr Pointer to the destination image. 
Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void activation_layer_ex( + TENSOR3D_DECLARATION(input) +#ifndef IN_PLACE + , + TENSOR3D_DECLARATION(output) +#endif /* not IN_PLACE */ +) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); +#ifdef IN_PLACE + Tensor3D output = input; +#else /* IN_PLACE */ + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); +#endif /* IN_PLACE */ + + // Load data + TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr); + + // Perform activation + data = ACTIVATION_OP(ACT, data); + + // Store result + VSTORE(VEC_SIZE) + (data, 0, (__global DATA_TYPE *)output.ptr); +} + +#endif /* defined(ACT) */ diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl new file mode 100644 index 000000000..9a6921d7c --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) +/** Perform arg_max/arg_min + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 + * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using + * -DOP_CODE = number. e.g. -DOP_CODE=1 + * + * @param[in] input_ptr Pointer to the source image. 
Supported data types: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[out] output_ptr Pointer to the destination image. Supported data types: U32 + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] axis Axis through which reduction occurs for max value index + * @param[in] dim Dimension across the axis to be reduced. + */ + +__kernel void arg_op(TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output), + const int axis, + const int dim) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + + int indices[4] = + { + get_global_id(0), + get_global_id(1), + get_global_id(2) % DEPTH_OUT, + get_global_id(2) / DEPTH_OUT, + }; + + DATA_TYPE value = *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); + DATA_TYPE tval = value; + int idx = 0; + for(int i = 1; i < dim; ++i) + { + indices[axis] = i; + + #if OP_CODE == 1 // ArgMax + value = max(value, *((__global DATA_TYPE *) + tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]))); + #elif OP_CODE == 2 //ArgMin + value = min(value, *((__global DATA_TYPE *) + tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]))); + #else + return; + + #endif + + if(tval!=value) + { + idx = indices[axis]; + tval = value; + } + } + + *((__global uint *)out.ptr) = idx; +} +#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_ex.cl new file mode 100644 index 000000000..2ed698951 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_ex.cl @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. 
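The arg_op kernel above keeps a running max (or min) along the chosen axis and records the index at which that value was last updated. A scalar C++ reference of the ArgMax case (OP_CODE == 1) for a single output element, using an illustrative std::vector in place of the tensor accesses.

#include <cstddef>
#include <vector>

// Reference for one work-item of arg_op with OP_CODE == 1 (ArgMax):
// walk the reduction axis, keep the best value and the index where it was found.
std::size_t arg_max_reference(const std::vector<float> &axis_values)
{
  std::size_t best_index = 0;
  float best_value = axis_values.empty() ? 0.0f : axis_values[0];
  for (std::size_t i = 1; i < axis_values.size(); ++i)
  {
    if (axis_values[i] > best_value)
    {
      best_value = axis_values[i];
      best_index = i;
    }
  }
  return best_index;
}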
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#ifdef SATURATE +#define SUB(x, y) sub_sat((x), (y)) +#else /* SATURATE */ +#define SUB(x, y) (x) - (y) +#endif /* SATURATE */ + +/** This function subtracts one tensors from another. + * + * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: + * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short + * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used. + * + * @param[in] in1_ptr Pointer to the source tensor. Supported data types: U8, S16 + * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] in2_ptr Pointer to the source tensor. Supported data types: U8, S16 + * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] out_ptr Pointer to the destination tensor. 
Supported data types: U8, S16 + * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void arithmetic_sub_ex( + TENSOR3D_DECLARATION(in1), + TENSOR3D_DECLARATION(in2), + TENSOR3D_DECLARATION(out)) +{ + // Get pixels pointer + Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); + Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); + + // Load values + VEC_DATA_TYPE(DATA_TYPE_OUT, 16) + in_a = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16)); + VEC_DATA_TYPE(DATA_TYPE_OUT, 16) + in_b = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16)); + + // Calculate and store result + vstore16(SUB(in_a, in_b), 0, (__global DATA_TYPE_OUT *)out.ptr); +} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl index 0c0a9ede6..5cd0a4309 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl @@ -2,32 +2,20 @@ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved * Copyright (c) 2016, 2017 ARM Limited. * - * SPDX-License-Identifier: MIT + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: + * http://www.apache.org/licenses/LICENSE-2.0 * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
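The SUB macro used by arithmetic_sub_ex above expands to sub_sat when the kernel is compiled with -DSATURATE and to a plain wrapping subtraction otherwise. A scalar C++ analogue for an S16 output, purely to illustrate the two policies; the helper itself is not part of the library.

#include <algorithm>
#include <cstdint>
#include <limits>

// Scalar analogue of SUB(x, y) for DATA_TYPE_OUT = short.
int16_t sub_s16(int16_t x, int16_t y, bool saturate)
{
  const int32_t wide = static_cast<int32_t>(x) - static_cast<int32_t>(y);
  if (!saturate)
  {
    return static_cast<int16_t>(wide); // wrapping policy (no -DSATURATE)
  }
  const int32_t lo = std::numeric_limits<int16_t>::min();
  const int32_t hi = std::numeric_limits<int16_t>::max();
  return static_cast<int16_t>(std::clamp(wide, lo, hi)); // saturating policy
}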
*/ #include "helpers_asymm.h" -#if defined(FIXED_POINT_POSITION) -#include "fixed_point.h" -#endif /* FIXED_POINT_POSITION */ - #ifdef SATURATE #define ADD(x, y) add_sat((x), (y)) #define SUB(x, y) sub_sat((x), (y)) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/batch_to_space_nd.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/batch_to_space_nd.cl new file mode 100644 index 000000000..ad6a48a02 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/batch_to_space_nd.cl @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016, 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE0) && defined(BLOCK_SIZE1) && defined(BATCH_OUT) +/** Perform batch to space rearrangement of tensor + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Output tensor batch should be given as a preprocessor argument using -DBATCH_OUT=size. e.g. -DBATCH_OUT=16 + * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE0=size. e.g. -DBLOCK_SIZE0=1 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * + * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void batch_to_space_nd(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output))
+ {
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+ int out_index[4]={0};
+ int in_index[4]={0};
+
+ out_index[0] = get_global_id(0);//W
+ out_index[1] = get_global_id(1);//H
+ out_index[2] = get_global_id(2) % DEPTH_OUT;//C
+ out_index[3] = get_global_id(2) / DEPTH_OUT;//N
+
+ in_index[0] = out_index[0]/BLOCK_SIZE1;
+ in_index[1] = out_index[1]/BLOCK_SIZE0;
+ in_index[2] = out_index[2];
+ in_index[3] = out_index[3] + ((out_index[1] % BLOCK_SIZE0) * BLOCK_SIZE0 + out_index[0] % BLOCK_SIZE1) * BATCH_OUT;
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3]));
+ }
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE0) && defined(BLOCK_SIZE1) && defined(BATCH_OUT)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl
new file mode 100644
index 000000000..bea61f53e
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(OP_CODE) && defined(DATA_TYPE)
+/** returns truth value of the two input tensors for BINARY LOGICAL OP.
+ * where BINARY LOGICAL OP can be AND, OR.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=uchar
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using
+ * -DOP_CODE = number. e.g. -DOP_CODE=1
+ *
+ * @param[in] input1_ptr Pointer to the source tensor.
Supported data types: QASYMM8 + * @param[in] input1_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input1_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source tensor + * + * @param[in] input2_ptr Pointer to the source tensor.Supported data types: QASYMM8 + * @param[in] input2_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input2_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input2_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input2_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input2_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source tensor + * + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: QASYMM8 + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + */ +__kernel void binary_logical_op( + TENSOR3D_DECLARATION(input1), + TENSOR3D_DECLARATION(input2), + TENSOR3D_DECLARATION(output)) +{ + Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1); + Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + #if OP_CODE == 1 // LOGICAL AND + VSTORE(VEC_SIZE) + (CONVERT(VLOAD(VEC_SIZE) + (0, (__global DATA_TYPE *)input1.ptr) && VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr), + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr); + + #elif OP_CODE == 2 // LOGICAL OR + VSTORE(VEC_SIZE) + (CONVERT(VLOAD(VEC_SIZE) + (0, (__global DATA_TYPE *)input1.ptr) || VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr), + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr); + + #else // OP NOT SUPPORTED + return + + #endif +} +#endif //if defined(OP_CODE) && defined(DATA_TYPE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl index 113804cca..3d4675e5d 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl @@ -2,38 +2,34 @@ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved * Copyright (c) 2017 ARM Limited. 
* - * SPDX-License-Identifier: MIT + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: + * http://www.apache.org/licenses/LICENSE-2.0 * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ #include "helpers.h" -#ifndef SCALE_IN -#define SCALE_IN 1.0f +#ifndef SCALE +#define SCALE 1.0f +#endif +#ifndef OFFSET +#define OFFSET 0 #endif -#ifndef OFFSET_IN -#define OFFSET_IN 0 +#ifndef VEC_SIZE +#define VEC_SIZE 1 #endif +#if defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) /** Perform a cast operation on an input tensor. * - * @attention Data type can be passed using the -DDATA_TYPE_IN compile flag, e.g. -DDATA_TYPE_IN=float + * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 * * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 @@ -65,9 +61,9 @@ __kernel void cast( 0, (__global DATA_TYPE_OUT *)output.ptr); } - /** Perform a cast operation on an QASYMM8 input tensor. - * + * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int + * @attention Offset and Scale of input should be given as a preprocessor argument using -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5 * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 * * @param[in] input_ptr Pointer to the source image. 
Supported data types: F16/F32 @@ -96,8 +92,8 @@ __kernel void cast_qasymm_in( VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr); - VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET_IN); - VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE_IN); + VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET); + VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE); VEC_DATA_TYPE(int, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(int, VEC_SIZE)) - offset; VEC_DATA_TYPE(float, VEC_SIZE) out_data = CONVERT(tmp, VEC_DATA_TYPE(float, VEC_SIZE)) * scale; @@ -108,7 +104,8 @@ __kernel void cast_qasymm_in( /** Perform a cast operation on an QASYMM8 output tensor. - * + * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int + * @attention Offset and Scale of output should be given as a preprocessor argument using -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5 * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 * * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 @@ -137,8 +134,8 @@ __kernel void cast_qasymm_out( VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr); - VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET_IN); - VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE_IN); + VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET); + VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE); VEC_DATA_TYPE(float, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(float, VEC_SIZE)) / scale; VEC_DATA_TYPE(float, VEC_SIZE) out_data = tmp + CONVERT(offset, VEC_DATA_TYPE(float, VEC_SIZE)); @@ -146,3 +143,4 @@ __kernel void cast_qasymm_out( VSTORE(VEC_SIZE)(CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)output.ptr); } +#endif // defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op.cl new file mode 100644 index 000000000..765072556 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op.cl @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) && defined(OP_CODE) +/** Returns truth value of comparison operators. + * Comparison operators may be equal, not_equal etc. 
+ * + * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN, -DDATA_TYPE_OUT, + * e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT = uchar + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using + * -DOP_CODE = number. e.g. -DOP_CODE=1 + * + * @param[in] input1_ptr Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] input1_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input1_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source tensor + * + * @param[in] input2_ptr Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] input2_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input2_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input2_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input2_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input2_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source tensor + * + * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: QASYMM8 + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void comparison_op( + TENSOR3D_DECLARATION(input1), + TENSOR3D_DECLARATION(input2), + TENSOR3D_DECLARATION(output)) +{ + Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1); + Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + #if OP_CODE == 1 //EQUAL + VSTORE(VEC_SIZE) + (CONVERT(VLOAD(VEC_SIZE) + (0, (__global DATA_TYPE_IN *)input1.ptr) == VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input2.ptr), + VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),0, (__global DATA_TYPE_OUT *)output.ptr); + + #elif OP_CODE == 2 //NOT_EQUAL + VSTORE(VEC_SIZE) + (CONVERT(VLOAD(VEC_SIZE) + (0, (__global DATA_TYPE_IN *)input1.ptr) != VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input2.ptr), + VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)output.ptr); + + #else // OP NOT SUPPORTED + return; + + #endif +} +#endif // defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) && defined(OP_CODE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl new file mode 100644 index 000000000..1eb305f7b --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" +#define SUB(x, y) (x) - (y) + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(DATA_TYPE_OUT) + +#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE) +#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) +#define VEC_OUT VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE) + +/** Returns the truth value of comparison . + * @attention Offset and Scale of both input should be given as a preprocessor argument using -DOFFSET_IN1=int, -DOFFSET_IN2=int, -DSCALE_IN1=float and -DSCALE_IN2=float. e.g. -DOFFSET_IN1=1, -DOFFSET_IN2=0, -DSCALE_IN1=0.5, -DSCALE_IN2=0.5 + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. 
-DVEC_SIZE=16 + * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using + * -DOP_CODE = number. e.g. -DOP_CODE=1 + * + * @param[in] input1_ptr Pointer to the source tensor. Supported data types: QASYMM8 + * @param[in] input1_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input1_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source tensor + * + * @param[in] input2_ptr Pointer to the source tensor. Supported data types: QASYMM8 + * @param[in] input2_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input2_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input2_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input2_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input2_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source tensor + * + * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: QASYMM8
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void comparison_op_qasymm8(
+ TENSOR3D_DECLARATION(in1),
+ TENSOR3D_DECLARATION(in2),
+ TENSOR3D_DECLARATION(out))
+{
+ // Get pixels pointer
+ Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+ Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+ VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)in1.ptr), VEC_INT);
+ VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)in2.ptr), VEC_INT);
+
+ in_a = SUB(in_a, (VEC_INT)((int)OFFSET_IN1));
+ in_b = SUB(in_b, (VEC_INT)((int)OFFSET_IN2));
+
+ const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1);
+ const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2);
+
+ #if OPCODE == 1 //EQUAL QUANTIZED
+ VSTORE(VEC_SIZE)(CONVERT(in1f32 == in2f32, VEC_OUT), 0, (__global DATA_TYPE_OUT *)out.ptr);
+
+ #elif OPCODE == 2 //NOT EQUAL QUANTIZED
+ VSTORE(VEC_SIZE)(CONVERT(in1f32 != in2f32, VEC_OUT), 0, (__global DATA_TYPE_OUT *)out.ptr);
+
+ #else // OP NOT SUPPORTED
+ return;
+
+ #endif
+}
+#endif // defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(DATA_TYPE_OUT)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
new file mode 100644
index 000000000..fef2243e7
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE)
+/** Perform depth to space rearrangement of tensor
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
+ * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. -DBLOCK_SIZE=1
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void depth_to_space(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output))
+ {
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+ int out_index[4]={0};
+ int in_index[4]={0};
+
+ out_index[0] = get_global_id(0);//W
+ out_index[1] = get_global_id(1);//H
+ out_index[2] = get_global_id(2) % DEPTH_OUT;//C
+ out_index[3] = get_global_id(2) / DEPTH_OUT;//B
+
+ in_index[0] = out_index[0]/BLOCK_SIZE;
+ in_index[1] = out_index[1]/BLOCK_SIZE;
+ in_index[2] = out_index[2] + ((out_index[1] % BLOCK_SIZE) * BLOCK_SIZE + out_index[0] % BLOCK_SIZE) * DEPTH_OUT;
+ in_index[3] = out_index[3];
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2],in_index[3]));
+ }
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl
new file mode 100644
index 000000000..348458fe9
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS) +/** Perform embedding_lookup of input tensor + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16 + * @attention Number of input dimensions are passed as a preprocessor argument using -DNUM_DIMS=size, e.g. -DNUM_DIMS=4 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] lookups_ptr Pointer to the lookups vector. 
Supported data types: S32 + * @param[in] lookups_stride_x Stride of the lookups vector in X dimension (in bytes) + * @param[in] lookups_step_x lookups_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector + */ + +__kernel void embedding_lookup(TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output), + VECTOR_DECLARATION(lookups)) +{ + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT); + + Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups); + + //lookup ids for based on the tensor dimensions + int lup_id[4] = {0}; + + lup_id[0] = (NUM_DIMS == 1)?*((__global int *)vector_offset(&lups,get_global_id(0))) + :get_global_id(0); + lup_id[1] = (NUM_DIMS == 2)?*((__global int *)vector_offset(&lups,get_global_id(1))) + :get_global_id(1); + lup_id[2] = (NUM_DIMS == 3)?*((__global int *)vector_offset(&lups,get_global_id(2))) + :get_global_id(2)%DEPTH_OUT; + lup_id[3] = (NUM_DIMS == 4)?*((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) + :get_global_id(2) / DEPTH_OUT; + + in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + lup_id[1] * input_step_y + + lup_id[2] * input_step_z + lup_id[3] * input_step_w; + + VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), + 0, (__global DATA_TYPE *)out.ptr); +} +#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl new file mode 100644 index 000000000..69d94f30a --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(DATA_TYPE) +/** Perform an exponential operation on an input tensor. + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Can only take floating point data types. + * + * @param[in] input_ptr Pointer to the source image. 
Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void exp_layer( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output)) +{ + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VSTORE(VEC_SIZE) + (exp(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr)), 0, (__global DATA_TYPE *)output.ptr); +} +#endif // defined(DATA_TYPE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/fixed_point.h b/libs/ARMComputeEx/src/core/CL/cl_kernels/fixed_point.h deleted file mode 100644 index 7807533e2..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/fixed_point.h +++ /dev/null @@ -1,565 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_FIXED_POINT_H -#define ARM_COMPUTE_FIXED_POINT_H - -#define TYPE_ALIAS(type, alias) \ - typedef type alias; \ - typedef type alias##x##1; \ - typedef type##2 alias##x##2; \ - typedef type##3 alias##x##3; \ - typedef type##4 alias##x##4; \ - typedef type##8 alias##x##8; \ - typedef type##16 alias##x##16; - -TYPE_ALIAS(char, qs8) -TYPE_ALIAS(short, qs16) -TYPE_ALIAS(int, qs32) - -#define qs8_MIN ((char)CHAR_MIN) -#define qs8_MAX ((char)CHAR_MAX) -#define qs16_MIN ((short)SHRT_MIN) -#define qs16_MAX ((short)SHRT_MAX) -#define qs32_MIN ((int)INT_MIN) -#define qs32_MAX ((int)INT_MAX) - -#define qu8_MIN ((uchar)0) -#define qu8_MAX ((uchar)UCHAR_MAX) -#define qu16_MIN ((ushort)0) -#define qu16_MAX ((ushort)USHRT_MAX) -#define qu32_MIN ((uint)0) -#define qu32_MAX ((uint)UINT_MAX) - -#define qs8_TYPE char -#define qs8x1_TYPE char -#define qs8x2_TYPE char2 -#define qs8x3_TYPE char3 -#define qs8x4_TYPE char4 -#define qs8x8_TYPE char8 -#define qs8x16_TYPE char16 - -#define qs16_TYPE short -#define qs16x1_TYPE short -#define qs16x2_TYPE short2 -#define qs16x3_TYPE short3 -#define qs16x4_TYPE short4 -#define qs16x8_TYPE short8 -#define qs16x16_TYPE short16 - -#define qs32_TYPE int -#define qs32x1_TYPE int -#define qs32x2_TYPE int2 -#define qs32x3_TYPE int3 -#define qs32x4_TYPE int4 -#define qs32x8_TYPE int8 -#define qs32x16_TYPE int16 - -/* All internal constants are represented in the maximum supported fixed point format (QS16), - * thus we define an additional shift parameter required to convert the constant - * from the maximum supported format to the require one. - */ -#define qs8_SHIFT 8 -#define qs16_SHIFT 0 - -#undef VEC_DATA_TYPE_STR -#undef VEC_DATA_TYPE -#undef CONVERT_STR -#undef CONVERT -#undef CONVERT_SAT_STR -#undef CONVERT_SAT - -#define VEC_DATA_TYPE_STR(type, size) type##x##size -#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) - -#define CONVERT_STR3(x, type, rtype) (convert_##rtype((x))) -#define CONVERT_STR2(x, type, rtype) CONVERT_STR3(x, type, rtype) -#define CONVERT_STR(x, type) CONVERT_STR2(x, type, type##_TYPE) -#define CONVERT(x, type) CONVERT_STR(x, type) - -#define CONVERT_SAT_STR3(x, type, rtype) (convert_##rtype##_sat((x))) -#define CONVERT_SAT_STR2(x, type, rtype) CONVERT_SAT_STR3(x, type, rtype) -#define CONVERT_SAT_STR(x, type) CONVERT_SAT_STR2(x, type, type##_TYPE) -#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) - -/** Computes saturating absolute value of fixed point vector. - * - * @param[in] type the actual data type. - * - * @return The result of the fixed point absolute value. - */ -#define ABSQ_SAT_IMPL(type) \ - inline type abs_##type##_sat(type VopA) { return CONVERT_SAT(abs(VopA), type); } - -ABSQ_SAT_IMPL(qs8x16) -ABSQ_SAT_IMPL(qs16x8) - -#define ABS_SAT_OP_EXPAND_STR(a, type, size) abs_##type##x##size##_sat((a)) -#define ABS_SAT_OP_EXPAND(a, type, size) ABS_SAT_OP_EXPAND_STR(a, type, size) - -/** Computes max of fixed point types. - * - * @param[in] type the actual data type. - * - * @return The result of the fixed point maximum. 
- */ -#define MAXQ_IMPL(type) \ - inline type max_##type(type VopA, type VopB) { return max(VopA, VopB); } - -MAXQ_IMPL(qs8x1) -MAXQ_IMPL(qs8x2) -MAXQ_IMPL(qs8x4) -MAXQ_IMPL(qs8x8) -MAXQ_IMPL(qs8x16) -MAXQ_IMPL(qs16x1) -MAXQ_IMPL(qs16x2) -MAXQ_IMPL(qs16x4) -MAXQ_IMPL(qs16x8) -MAXQ_IMPL(qs16x16) - -#define MAX_OP_EXPAND_STR(a, b, type, size) max_##type##x##size((a), (b)) -#define MAX_OP_EXPAND(a, b, type, size) MAX_OP_EXPAND_STR(a, b, type, size) - -/** Computes saturated addition of fixed point types. - * - * @param[in] type the actual data type. - * - * @return The result of the fixed point addition. The result is saturated in case of overflow - */ -#define ADDQ_SAT_IMPL(type) \ - inline type add_sat_##type(type VopA, type VopB) { return add_sat(VopA, VopB); } - -ADDQ_SAT_IMPL(qs8x1) -ADDQ_SAT_IMPL(qs8x2) -ADDQ_SAT_IMPL(qs8x4) -ADDQ_SAT_IMPL(qs8x8) -ADDQ_SAT_IMPL(qs8x16) -ADDQ_SAT_IMPL(qs16x1) -ADDQ_SAT_IMPL(qs16x2) -ADDQ_SAT_IMPL(qs16x4) -ADDQ_SAT_IMPL(qs16x8) -ADDQ_SAT_IMPL(qs16x16) -ADDQ_SAT_IMPL(qs32x1) -ADDQ_SAT_IMPL(qs32x2) -ADDQ_SAT_IMPL(qs32x4) -ADDQ_SAT_IMPL(qs32x8) -ADDQ_SAT_IMPL(qs32x16) - -#define ADD_SAT_OP_EXPAND_STR(a, b, type, size) add_sat_##type##x##size((a), (b)) -#define ADD_SAT_OP_EXPAND(a, b, type, size) ADD_SAT_OP_EXPAND_STR(a, b, type, size) - -/** Computes saturated subtraction of fixed point types. - * - * @param[in] type the actual data type. - * - * @return The result of the fixed point subtraction. The result is saturated in case of overflow - */ -#define SUBQ_SAT_IMPL(type) \ - inline type sub_sat_##type(type VopA, type VopB) { return sub_sat(VopA, VopB); } - -SUBQ_SAT_IMPL(qs8x1) -SUBQ_SAT_IMPL(qs8x2) -SUBQ_SAT_IMPL(qs8x4) -SUBQ_SAT_IMPL(qs8x8) -SUBQ_SAT_IMPL(qs8x16) -SUBQ_SAT_IMPL(qs16x1) -SUBQ_SAT_IMPL(qs16x2) -SUBQ_SAT_IMPL(qs16x4) -SUBQ_SAT_IMPL(qs16x8) -SUBQ_SAT_IMPL(qs16x16) - -#define SUB_SAT_OP_EXPAND_STR(a, b, type, size) sub_sat_##type##x##size((a), (b)) -#define SUB_SAT_OP_EXPAND(a, b, type, size) SUB_SAT_OP_EXPAND_STR(a, b, type, size) - -/* Multiply of two fixed point numbers - * - * @param[in] type the actual data type. - * @param[in] itype the intermediate data type. - * - * @return The result of the fixed point multiplication. - */ -#define MULQ_IMPL(type, itype) \ - inline type mul_##type(type VopA, type VopB, int fixed_point_position) \ - { \ - itype round_val = (itype)(1 << (fixed_point_position - 1)); \ - itype res = CONVERT((VopA), itype) * CONVERT((VopB), itype) + round_val; \ - return CONVERT((res >> (itype)fixed_point_position), type); \ - } - -MULQ_IMPL(qs8x8, qs16x8) -MULQ_IMPL(qs16x8, qs32x8) -MULQ_IMPL(qs8x16, qs16x16) -MULQ_IMPL(qs16x16, qs32x16) - -#define MUL_OP_EXPAND_STR(a, b, type, size, position) mul_##type##x##size((a), (b), (position)) -#define MUL_OP_EXPAND(a, b, type, size, position) MUL_OP_EXPAND_STR(a, b, type, size, position) - -/* Saturate multiply of two fixed point numbers - * - * @param[in] type the actual data type. - * @param[in] itype the intermediate data type. - * - * @return The result of the fixed point multiplication. 
The result is saturated in case of overflow - */ -#define MULQ_SAT_IMPL(type, itype) \ - inline type mul_sat_##type(type VopA, type VopB, int fixed_point_position) \ - { \ - itype round_val = (itype)(1 << (fixed_point_position - 1)); \ - itype res = mad_sat(CONVERT((VopA), itype), CONVERT((VopB), itype), round_val); \ - return CONVERT_SAT((res >> (itype)fixed_point_position), type); \ - } - -MULQ_SAT_IMPL(qs8x1, qs16x1) -MULQ_SAT_IMPL(qs8x2, qs16x2) -MULQ_SAT_IMPL(qs8x3, qs16x3) -MULQ_SAT_IMPL(qs8x4, qs16x4) -MULQ_SAT_IMPL(qs8x8, qs16x8) -MULQ_SAT_IMPL(qs8x16, qs16x16) -MULQ_SAT_IMPL(qs16x1, qs32x1) -MULQ_SAT_IMPL(qs16x2, qs32x2) -MULQ_SAT_IMPL(qs16x3, qs32x3) -MULQ_SAT_IMPL(qs16x4, qs32x4) -MULQ_SAT_IMPL(qs16x8, qs32x8) -MULQ_SAT_IMPL(qs16x16, qs32x16) - -#define MUL_SAT_OP_EXPAND_STR(a, b, type, size, position) \ - mul_sat_##type##x##size((a), (b), (position)) -#define MUL_SAT_OP_EXPAND(a, b, type, size, position) \ - MUL_SAT_OP_EXPAND_STR(a, b, type, size, position) - -/** Saturate multiply-accumulate - * - * @param[in] type the actual data type. - * @param[in] itype the intermediate data type. - * - * @return The result of the fixed point multiply-accumulate. The result is saturated in case of - * overflow - */ -#define MLAQ_SAT_IMPL(type, itype) \ - type mla_sat_##type(type VopA, type VopB, type VopC, int fixed_point_position) \ - { \ - itype res = mad_sat(CONVERT(VopB, itype), CONVERT(VopC, itype), \ - (itype)(1 << (fixed_point_position - 1))); \ - return add_sat(VopA, CONVERT_SAT(res >> (itype)fixed_point_position, type)); \ - } - -MLAQ_SAT_IMPL(qs8x8, qs16x8) -MLAQ_SAT_IMPL(qs8x16, qs16x16) -MLAQ_SAT_IMPL(qs16x8, qs32x8) - -#define MLA_SAT_OP_EXPAND_STR(a, b, c, type, size, position) \ - mla_sat_##type##x##size((a), (b), (c), (position)) -#define MLA_SAT_OP_EXPAND(a, b, c, type, size, position) \ - MLA_SAT_OP_EXPAND_STR(a, b, c, type, size, position) - -/** Saturate multiply-accumulate long - * - * @param[in] type the actual data type. - * @param[in] itype the intermediate data type. - * - * @return The result of the fixed point multiply-accumulate long. The result is saturated in case - * of overflow - */ -#define MLALQ_SAT_IMPL(type, itype) \ - itype mlal_sat_##type(itype VopA, type VopB, type VopC, int fixed_point_position) \ - { \ - itype res = mad_sat(CONVERT(VopB, itype), CONVERT(VopC, itype), \ - (itype)(1 << (fixed_point_position - 1))); \ - return add_sat(VopA, res >> (itype)fixed_point_position); \ - } - -MLALQ_SAT_IMPL(qs8x8, qs16x8) -MLALQ_SAT_IMPL(qs16x8, qs32x8) - -#define MLAL_SAT_OP_EXPAND_STR(a, b, c, type, size, position) \ - mlal_sat_##type##x##size((a), (b), (c), (position)) -#define MLAL_SAT_OP_EXPAND(a, b, c, type, size, position) \ - MLAL_SAT_OP_EXPAND_STR(a, b, c, type, size, position) - -/** Saturate division of two fixed point vectors - * - * @param[in] stype the actual scalar data type. - * @param[in] type the actual data type. - * @param[in] itype the intermediate data type. - * - * @return The result of the fixed point division. 
The result is saturated in case of overflow - */ -#define DIVQ_SAT_IMPL(stype, type, itype) \ - inline type div_sat_##type(type VopA, type VopB, int fixed_point_position) \ - { \ - itype conv_a = CONVERT((VopA), itype); \ - itype denominator = CONVERT((VopB), itype); \ - itype numerator = conv_a << (itype)(fixed_point_position); \ - itype res = select((itype)(numerator / denominator), \ - select((itype)stype##_MAX, (itype)stype##_MIN, (itype)(conv_a < (itype)0)), \ - (itype)(denominator == (itype)0)); \ - return CONVERT_SAT((res), type); \ - } - -DIVQ_SAT_IMPL(qs8, qs8x16, qs16x16) -DIVQ_SAT_IMPL(qs16, qs16x8, qs32x8) -DIVQ_SAT_IMPL(qs16, qs16x16, qs32x16) -DIVQ_SAT_IMPL(qs8, qs8, qs16) -DIVQ_SAT_IMPL(qs16, qs16, qs32) - -#define DIV_SAT_OP_EXPAND_STR(a, b, type, position) div_sat_##type((a), (b), (position)) -#define DIV_SAT_OP_EXPAND(a, b, type, position) DIV_SAT_OP_EXPAND_STR(a, b, type, position) - -#define DIV_SAT_OP_VEC_EXPAND_STR(a, b, type, size, position) \ - div_sat_##type##x##size((a), (b), (position)) -#define DIV_SAT_OP_VEC_EXPAND(a, b, type, size, position) \ - DIV_SAT_OP_VEC_EXPAND_STR(a, b, type, size, position) - -/** Saturate exponential of a fixed point vector - * - * @note Implemented approach uses taylor polynomial to approximate the exponential function. - * - * @param[in] stype the actual scalar data type. - * @param[in] type the actual data type. - * @param[in] size the number of the calculated elements. - * - * @return The result of the fixed point exponential. The result is saturated in case of overflow - */ -#define EXPQ_IMPL(stype, type, size) \ - inline type exp_sat_##type(type VopA, int fixed_point_position) \ - { \ - type const_one = (type)(1 << (fixed_point_position)); \ - type ln2 = (type)((((0x58B9 >> (14 - fixed_point_position))) + 1) >> 1); \ - type inv_ln2 = (type)((((0x38AA >> (14 - fixed_point_position)) + 1) >> 1)) | const_one; \ - type A = (type)(((0x7FBA >> (14 - fixed_point_position)) + 1) >> 1); \ - type B = (type)(((0x3FE9 >> (14 - fixed_point_position)) + 1) >> 1); \ - type C = (type)(((0x1693 >> (14 - fixed_point_position)) + 1) >> 1); \ - type D = (type)(((0x0592 >> (14 - fixed_point_position)) + 1) >> 1); \ - type m = MUL_SAT_OP_EXPAND(VopA, inv_ln2, stype, size, fixed_point_position); \ - type dec_m = m >> (type)fixed_point_position; \ - type alpha = MUL_SAT_OP_EXPAND(dec_m << (type)fixed_point_position, ln2, stype, size, \ - fixed_point_position); \ - alpha = CONVERT(abs_diff(VopA, alpha), type); \ - type sum = add_sat(MUL_SAT_OP_EXPAND(alpha, D, stype, size, fixed_point_position), C); \ - sum = add_sat(MUL_SAT_OP_EXPAND(alpha, sum, stype, size, fixed_point_position), B); \ - sum = add_sat(MUL_SAT_OP_EXPAND(alpha, sum, stype, size, fixed_point_position), A); \ - sum = add_sat(MUL_SAT_OP_EXPAND(alpha, sum, stype, size, fixed_point_position), const_one); \ - return select((type)stype##_MAX, select(sum << dec_m, sum >> -dec_m, dec_m < (type)0), \ - clz(sum) > dec_m); /* Saturate result if needed */ \ - } - -EXPQ_IMPL(qs8, qs8x2, 2) -EXPQ_IMPL(qs8, qs8x4, 4) -EXPQ_IMPL(qs8, qs8x8, 8) -EXPQ_IMPL(qs8, qs8x16, 16) -EXPQ_IMPL(qs16, qs16x2, 2) -EXPQ_IMPL(qs16, qs16x4, 4) -EXPQ_IMPL(qs16, qs16x8, 8) -EXPQ_IMPL(qs16, qs16x16, 16) - -#define EXP_OP_EXPAND_STR(a, type, size, position) exp_sat_##type##x##size((a), (position)) -#define EXP_OP_EXPAND(a, type, size, position) EXP_OP_EXPAND_STR(a, type, size, position) - -/** Saturate logarithm of a fixed point vector - * - * @note Implemented approach uses taylor polynomial to approximate the logarithm 
function. - * - * @param[in] stype the actual scalar data type. - * @param[in] type the actual data type. - * @param[in] size the number of the calculated elements. - * - * @return The result of the fixed point logarithm. The result is saturated in case of overflow - */ -#define LOGQ_IMPL(stype, type, size) \ - inline type log_sat_##type(type VopA, int fixed_point_position) \ - { \ - type const_one = (type)(1 << (fixed_point_position)); \ - type ln2 = (type)(0x58B9 >> (15 - fixed_point_position)); /* 1.4384189 */ \ - type A = (type)(0x5C0F >> (14 - fixed_point_position)); /* 1.4384189 */ \ - type B = -(type)(0x56AE >> (15 - fixed_point_position)); /* -0.6771900 */ \ - type C = (type)(0x2933 >> (15 - fixed_point_position)); /* 0.3218538 */ \ - type D = -(type)(0x0AA7 >> (15 - fixed_point_position)); /* -0.0832229 */ \ - type inter_a = \ - select(VopA, DIV_SAT_OP_VEC_EXPAND(const_one, VopA, stype, size, fixed_point_position), \ - VopA < const_one); \ - type shift_val = (type)(15 - stype##_SHIFT) - clz(inter_a >> (type)fixed_point_position); \ - inter_a = inter_a >> shift_val; \ - inter_a = sub_sat(inter_a, const_one); \ - type sum = add_sat(MUL_SAT_OP_EXPAND(inter_a, D, stype, size, fixed_point_position), C); \ - sum = add_sat(MUL_SAT_OP_EXPAND(inter_a, sum, stype, size, fixed_point_position), B); \ - sum = add_sat(MUL_SAT_OP_EXPAND(inter_a, sum, stype, size, fixed_point_position), A); \ - sum = MUL_SAT_OP_EXPAND(inter_a, sum, stype, size, fixed_point_position); \ - sum = MUL_SAT_OP_EXPAND(add_sat(sum, shift_val << (type)fixed_point_position), ln2, stype, \ - size, fixed_point_position); \ - return select(select(sum, -sum, VopA < const_one), (type)0, \ - VopA < (type)0); /* Saturate result if needed */ \ - } - -LOGQ_IMPL(qs8, qs8x16, 16) -LOGQ_IMPL(qs16, qs16x8, 8) -LOGQ_IMPL(qs16, qs16x16, 16) - -#define LOG_OP_EXPAND_STR(a, type, size, position) log_sat_##type##x##size((a), (position)) -#define LOG_OP_EXPAND(a, type, size, position) LOG_OP_EXPAND_STR(a, type, size, position) - -/** Saturate inverse square root of a fixed point vector - * - * @note Implemented approach uses Newton's method to approximate the inverse square root function. - * - * @param[in] stype the actual scalar data type. - * @param[in] type the actual data type. - * @param[in] size the number of the calculated elements. - * - * @return The result of the fixed point inverse square root. 
The result is saturated in case of - * overflow - */ -#define INVSQRTQ_IMPL(stype, type, size) \ - inline type invsqrt_sat_##type(type VopA, int fixed_point_position) \ - { \ - type const_three = (type)(3 << (fixed_point_position)); \ - type shift_value = (type)(16 - stype##_SHIFT) - (clz(VopA) + (type)fixed_point_position); \ - type temp = select((type)(VopA >> shift_value), \ - select((type)stype##_MAX, (type)(VopA << (-shift_value)), \ - (type)(clz(VopA) > (-shift_value))), \ - (type)(shift_value < (type)0)); \ - type x = temp; \ - x = MUL_SAT_OP_EXPAND( \ - x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \ - fixed_point_position), \ - temp, stype, size, fixed_point_position)), \ - stype, size, fixed_point_position) >> \ - 1; \ - x = MUL_SAT_OP_EXPAND( \ - x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \ - fixed_point_position), \ - temp, stype, size, fixed_point_position)), \ - stype, size, fixed_point_position) >> \ - 1; \ - x = MUL_SAT_OP_EXPAND( \ - x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \ - fixed_point_position), \ - temp, stype, size, fixed_point_position)), \ - stype, size, fixed_point_position) >> \ - 1; \ - if (sizeof((stype)(1)) > 1) /* Perform more iterations if datatype is QS16 */ \ - { \ - x = MUL_SAT_OP_EXPAND( \ - x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \ - fixed_point_position), \ - temp, stype, size, fixed_point_position)), \ - stype, size, fixed_point_position) >> \ - 1; \ - x = MUL_SAT_OP_EXPAND( \ - x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \ - fixed_point_position), \ - temp, stype, size, fixed_point_position)), \ - stype, size, fixed_point_position) >> \ - 1; \ - } \ - type shift_value2 = select(shift_value >> 1, (-shift_value) >> 1, shift_value < (type)0); \ - return select((type)(x >> shift_value2), select((type)stype##_MAX, (type)(x << shift_value2), \ - (type)(clz(x) > shift_value2)), \ - (type)(shift_value < (type)0)); /* Saturate result if needed */ \ - } - -INVSQRTQ_IMPL(qs8, qs8x1, 1) -INVSQRTQ_IMPL(qs16, qs16x1, 1) -INVSQRTQ_IMPL(qs8, qs8x16, 16) -INVSQRTQ_IMPL(qs16, qs16x8, 8) - -#define INVSQRT_OP_EXPAND_STR(a, type, size, position) invsqrt_sat_##type##x##size((a), (position)) -#define INVSQRT_OP_EXPAND(a, type, size, position) INVSQRT_OP_EXPAND_STR(a, type, size, position) - -/** Saturate hyperbolic tangent of a fixed point vector - * - * tanh(x) = (e^2x - 1)/(e^2x + 1) - * - * @param[in] stype the actual scalar data type. - * @param[in] type the actual data type. - * @param[in] size the number of the calculated elements. - * - * @return The result of the fixed point hyperbolic tangent. 
The result is saturated in case of - * overflow - */ -#define TANHQ_IMPL(stype, type, size) \ - inline type tanh_sat_##type(type VopA, int fixed_point_position) \ - { \ - type const_one = (type)(1 << (fixed_point_position)); \ - type const_two = (type)(2 << (fixed_point_position)); \ - type exp2x = \ - EXP_OP_EXPAND(MUL_SAT_OP_EXPAND(const_two, VopA, stype, size, fixed_point_position), \ - stype, size, fixed_point_position); \ - type num = SUB_SAT_OP_EXPAND(exp2x, const_one, stype, size); \ - type den = ADD_SAT_OP_EXPAND(exp2x, const_one, stype, size); \ - return DIV_SAT_OP_VEC_EXPAND(num, den, stype, size, fixed_point_position); \ - } - -TANHQ_IMPL(qs8, qs8x16, 16) -TANHQ_IMPL(qs16, qs16x8, 8) - -#define TANH_OP_EXPAND_STR(a, type, size, position) tanh_sat_##type##x##size((a), (position)) -#define TANH_OP_EXPAND(a, type, size, position) TANH_OP_EXPAND_STR(a, type, size, position) - -#define floatx16 float16 -#define float16_TYPE float16 - -#define CONVERTQ_DOWN_IMPL(in_type, out_type) \ - inline out_type convert_##out_type##_##in_type(in_type a, int fixed_point_position) \ - { \ - return CONVERT(a * (1 << fixed_point_position) + \ - select((in_type)-0.5f, (in_type)0.5f, isgreater(a, (in_type)0)), \ - out_type); \ - } - -CONVERTQ_DOWN_IMPL(float16, qs8x16) -CONVERTQ_DOWN_IMPL(float16, qs16x16) - -#define CONVERTQ_DOWN_SAT_IMPL(in_type, out_type) \ - inline out_type convert_##out_type##_##in_type##_sat(in_type a, int fixed_point_position) \ - { \ - return CONVERT_SAT(a * (1 << fixed_point_position) + \ - select((in_type)-0.5f, (in_type)0.5f, isgreater(a, (in_type)0)), \ - out_type); \ - } - -CONVERTQ_DOWN_SAT_IMPL(float16, qs8x16) -CONVERTQ_DOWN_SAT_IMPL(float16, qs16x16) - -#define CONVERTQ_UP_IMPL(in_type, out_type) \ - inline out_type convert_##out_type##_##in_type(in_type a, int fixed_point_position) \ - { \ - return CONVERT(a, out_type) / (1 << fixed_point_position); \ - } - -CONVERTQ_UP_IMPL(qs8x16, float16) -CONVERTQ_UP_IMPL(qs16x16, float16) - -#define SQCVT_SAT_IMPL(type) \ - inline type sqcvt_##type##_sat(float a, int fixed_point_position) \ - { \ - return CONVERT_SAT((a * (1 << fixed_point_position) + ((a < 0) ? -0.5f : 0.5f)), type); \ - } - -SQCVT_SAT_IMPL(qs8) -SQCVT_SAT_IMPL(qs16) - -#define SQCVT_SAT_OP_EXPAND_STR(a, type, position) sqcvt_##type##_sat((a), (position)) -#define SQCVT_SAT_OP_EXPAND(a, type, position) SQCVT_SAT_OP_EXPAND_STR((a), type, position) - -#endif // ARM_COMPUTE_FIXED_POINT_H diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl index 25e20f5f2..6b767d6c9 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl @@ -2,25 +2,17 @@ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved * Copyright (c) 2017 ARM Limited. * - * SPDX-License-Identifier: MIT + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: + * http://www.apache.org/licenses/LICENSE-2.0 * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ #include "helpers.h" diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl new file mode 100644 index 000000000..ed7409852 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS) +/** Perform hashtable_lookup of input tensor + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16 + * @attention Number of input dimensions are passed as a preprocessor argument using -DNUM_DIMS=size, e.g. -DNUM_DIMS=4 + * + * @param[in] input_ptr Pointer to the source tensor. 
Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] lookups_ptr Pointer to the lookups vector. 
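The hashtable_lookup kernel below replaces the index of the outermost dimension with the value read from the lookups vector and copies everything under it through; a negative key produces a zero-filled output row. A host-side reference for the 2-D case (NUM_DIMS == 2), where each looked-up row holds row_elems contiguous elements (the flat row layout is an assumption of this sketch):

/* Gather rows by key; key < 0 yields a zero row (mirrors the kernel's early-out path). */
void hashtable_lookup_ref(const float *in, const int *lookups, int n_lookups,
                          int row_elems, float *out)
{
    for (int i = 0; i < n_lookups; ++i) {
        int key = lookups[i];
        for (int j = 0; j < row_elems; ++j)
            out[i * row_elems + j] = (key < 0) ? 0.0f : in[key * row_elems + j];
    }
}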
Supported data types: S32 + * @param[in] lookups_stride_x Stride of the lookups vector in X dimension (in bytes) + * @param[in] lookups_step_x lookups_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector + */ +__kernel void hashtable_lookup(TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output), + VECTOR_DECLARATION(lookups)) +{ + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT); + + Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups); + + int lup_id[4] = {0}; + + lup_id[0] = (NUM_DIMS == 1)?*((__global int *)vector_offset(&lups,get_global_id(0))) + :get_global_id(0); + lup_id[1] = (NUM_DIMS == 2)?*((__global int *)vector_offset(&lups,get_global_id(1))) + :get_global_id(1); + lup_id[2] = (NUM_DIMS == 3)?*((__global int *)vector_offset(&lups,get_global_id(2))) + :get_global_id(2)%DEPTH_OUT; + lup_id[3] = (NUM_DIMS == 4)?*((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) + :get_global_id(2) / DEPTH_OUT; + + if (lup_id[NUM_DIMS-1] < 0) + { + VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, 0, (__global DATA_TYPE *)out.ptr); + return; + } + + in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + lup_id[1] * input_step_y + + lup_id[2] * input_step_z + lup_id[3] * input_step_w; + + VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), + 0, (__global DATA_TYPE *)out.ptr); +} +#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h index 8143d2398..0e123ae0a 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h @@ -24,15 +24,23 @@ #ifndef ARM_COMPUTE_HELPER_H #define ARM_COMPUTE_HELPER_H -#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) +#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) #pragma OPENCL EXTENSION cl_khr_fp16 : enable -#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) +#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) -#if defined(ARM_COMPUTE_DEBUG_ENABLED) -#if defined(cl_arm_printf) +#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) +#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) + +#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && \ + defined(cl_arm_integer_dot_product_accumulate_int8) +#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && + // defined(cl_arm_integer_dot_product_accumulate_int8) + +#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) #pragma OPENCL EXTENSION cl_arm_printf : enable -#endif // defined(cl_arm_printf) -#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) +#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) #define EXPAND(x) x @@ -175,7 +183,7 @@ typedef struct Tensor4D * * @return An image object */ -Vector inline update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, +inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x) { Vector vector = { @@ 
-201,7 +209,7 @@ Vector inline update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_ * * @return An image object */ -Image inline update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, +inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) { Image img = {.ptr = ptr, @@ -230,7 +238,7 @@ Image inline update_image_workitem_ptr(__global uchar *ptr, uint offset_first_el * * @return A 3D tensor object */ -Image inline update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, +inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) @@ -261,7 +269,7 @@ Image inline update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, * * @return A 3D tensor object */ -Tensor3D inline update_tensor3D_workitem_ptr(__global uchar *ptr, +inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) @@ -276,7 +284,7 @@ Tensor3D inline update_tensor3D_workitem_ptr(__global uchar *ptr, return tensor; } -Tensor4D inline update_tensor4D_workitem_ptr(__global uchar *ptr, +inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w, uint step_w, uint mod_size) @@ -299,7 +307,7 @@ Tensor4D inline update_tensor4D_workitem_ptr(__global uchar *ptr, * @param[in] vec Pointer to the starting position of the buffer * @param[in] x Relative X position */ -__global inline const uchar *vector_offset(const Vector *vec, int x) +inline __global const uchar *vector_offset(const Vector *vec, int x) { return vec->ptr + x * vec->stride_x; } @@ -310,7 +318,7 @@ __global inline const uchar *vector_offset(const Vector *vec, int x) * @param[in] x Relative X position * @param[in] y Relative Y position */ -__global inline uchar *offset(const Image *img, int x, int y) +inline __global uchar *offset(const Image *img, int x, int y) { return img->ptr + x * img->stride_x + y * img->stride_y; } @@ -322,7 +330,7 @@ __global inline uchar *offset(const Image *img, int x, int y) * @param[in] y Relative Y position * @param[in] z Relative Z position */ -__global inline const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z) +inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z) { return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z; } @@ -335,7 +343,7 @@ __global inline const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int * @param[in] z Relative Z position * @param[in] w Relative W position */ -__global inline const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w) +inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w) { return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w; diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl new file mode 100644 index 000000000..e3aa463db --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(DATA_TYPE) +/** Performs a negation of input tensor. + * + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * + * @param[in] in_ptr Pointer to the source image. Supported data types: S16/S32/F16/F32. + * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p input_ptr + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image + */ +__kernel void neg_tensor( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output)) +{ + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VSTORE(VEC_SIZE) + (-VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), 0, (__global DATA_TYPE *)output.ptr); +} +#endif // defined(DATA_TYPE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl new file mode 100644 index 000000000..ecf4696e9 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016, 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#if defined(IW) && defined(IH) && defined(ID) && defined(IB) && defined(DEPTH_OUT) && defined(ZERO_VALUE) +/** Perform space to depth rearrangement of tensor + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. 
-DDEPTH_OUT=16 + * @attention Input dimensions should be passed as a preprocessor argument using -DIW(width), -DIH(height), -DID(depth) and -DIB(batch). e.g. -DIW = 4 + * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p inpu +t_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in +bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * + * @param[in] pad_values Padding values for each of the dimensions. Only pad values for Up(for + * batch), Top(for height), Left(for width) and Front(for depth) are + * required. 
Supported data type: S32 + */ + +__kernel void pad( + TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output), + const int4 pad_values) + { + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + + int index[4]={0}; + + index[0] = get_global_id(0);//W + index[1] = get_global_id(1);//H + index[2] = get_global_id(2) % DEPTH_OUT;//C + index[3] = get_global_id(2) / DEPTH_OUT;//N + + if (index[0] < pad_values.x || index[0] >= (IW + pad_values.x) || + index[1] < pad_values.y || index[1] >= (IH + pad_values.y) || + index[2] < pad_values.z || index[2] >= (ID + pad_values.z) || + index[3] < pad_values.w || index[3] >= (IB + pad_values.w)) + { + *((__global DATA_TYPE *)out.ptr) = (DATA_TYPE)ZERO_VALUE; + } + else + { + *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *) + tensor4D_offset(&in, index[0] - pad_values.x, + index[1] - pad_values.y, + index[2] - pad_values.z, + index[3] - pad_values.w)); + } + } + +#endif //if defined(IW) && defined(IH) && defined(ID) && defined(IB) && defined(DEPTH_OUT) && defined(ZERO_VALUE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl new file mode 100644 index 000000000..7cc8b0354 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4) +/** Perform a Generic permute operation on an input tensor of Shape DCHW. + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16 + * @attention Permutation vector is passed as a preprocessor arguement using -DP1, -DP2, -DP3 and -DP4=int, e.g. -DP1=2 + * + * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U1 +6/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in b +ytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in b +ytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in b +ytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] output_ptr Pointer to the destination image. 
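For reference, the pad kernel above is a pure index shift: an output coordinate that falls inside the leading pad region, or beyond the input extent plus that pad, is written as ZERO_VALUE; otherwise the element is copied from the input at the coordinate minus the pad. A per-element host sketch (dense NCHW float layout assumed by this sketch):

#include <stddef.h>

/* Output element (w, h, c, n) of the padded tensor; pad_* are the leading pads
 * (Left, Top, Front, Up in the kernel's int4 pad_values). */
float pad_ref(const float *in, int IW, int IH, int ID, int IB,
              int pad_w, int pad_h, int pad_c, int pad_n,
              int w, int h, int c, int n, float zero_value)
{
    int iw = w - pad_w, ih = h - pad_h, ic = c - pad_c, ib = n - pad_n;
    if (iw < 0 || iw >= IW || ih < 0 || ih >= IH ||
        ic < 0 || ic >= ID || ib < 0 || ib >= IB)
        return zero_value;
    return in[((ib * (size_t)ID + ic) * IH + ih) * IW + iw];
}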
Supported data types: same as @p inpu +t_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in +bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void permute_generic( + TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output)) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); + + int out_index[4]; + int in_index[4]; + in_index[0] = get_global_id(0);//W + in_index[1] = get_global_id(1);//H + in_index[2] = get_global_id(2) % DEPTH_IN;//C + in_index[3] = get_global_id(2) / DEPTH_IN;//B + out_index[0] = in_index[P1]; + out_index[1] = in_index[P2]; + out_index[2] = in_index[P3]; + out_index[3] = in_index[P4]; + + *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0],out_index[1],out_index[2],out_index[3])) = *((__global DATA_TYPE *)in.ptr); +} +#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl index 512c62023..aa05121b1 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl @@ -2,25 +2,17 @@ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved * Copyright (c) 2016, 2017 ARM Limited. * - * SPDX-License-Identifier: MIT + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: + * http://www.apache.org/licenses/LICENSE-2.0 * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
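The permute_generic kernel above reads the element at the work item's input coordinate and writes it to the output coordinate obtained by reordering the four indices with the compile-time permutation P1..P4. The same mapping on the host (dense NCHW indexing assumed for this sketch):

#include <stddef.h>

/* dims[4] = {W, H, C, N}; dense offset for an NCHW buffer. */
static size_t off4(const int dims[4], const int idx[4])
{
    return ((size_t)idx[3] * dims[2] + idx[2]) * dims[1] * dims[0]
         + (size_t)idx[1] * dims[0] + idx[0];
}

/* out_index[i] = in_index[p[i]], exactly as permute_generic does with -DP1..-DP4. */
void permute4d_ref(const float *in, const int in_dims[4], const int p[4], float *out)
{
    int out_dims[4] = { in_dims[p[0]], in_dims[p[1]], in_dims[p[2]], in_dims[p[3]] };
    int i[4];
    for (i[3] = 0; i[3] < in_dims[3]; ++i[3])
      for (i[2] = 0; i[2] < in_dims[2]; ++i[2])
        for (i[1] = 0; i[1] < in_dims[1]; ++i[1])
          for (i[0] = 0; i[0] < in_dims[0]; ++i[0]) {
              int o[4] = { i[p[0]], i[p[1]], i[p[2]], i[p[3]] };
              out[off4(out_dims, o)] = in[off4(in_dims, i)];
          }
}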
+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ #include "helpers.h" diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl index 82edf3b1d..fdfb78003 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl @@ -2,40 +2,20 @@ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved * Copyright (c) 2016, 2017 ARM Limited. * - * SPDX-License-Identifier: MIT + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: + * http://www.apache.org/licenses/LICENSE-2.0 * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ #include "helpers.h" -#if defined(FIXED_POINT_POSITION) - -#include "fixed_point.h" - -#if defined(SATURATE) -#define DIV_OP(x, y, scale, type, size) DIV_SAT_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION) -#else // SATURATE -#define DIV_OP(x, y, scale, type, size) DIV_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION) -#endif // SATURATE - -#else // FIXED_POINT_POSITION - #if defined(SATURATE) #define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size##_sat(x)) #else // SATURATE @@ -45,17 +25,14 @@ #define DIV_OP(x, y, scale, type, size) CONVERT_OP_INT((x) / (y) >> scale, type, size) -#endif // FIXED_POINT_POSITION - /** Performs a pixelwise division with integer scale of integer inputs. * * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short * @attention The data_type of the intermediate result of the division should passed as well using -DDATA_TYPE_RES. * e.g. 
If one of inputs is S16 -DDATA_TYPE_RES=int should be passed else -DDATA_TYPE_RES=short. - * @note In case of fixed-point operation -DFIXED_POINT_POSITION=fixed_point_position must be provided: e.g. -DFIXED_POINT_POSITION=3 * - * @param[in] in1_ptr Pointer to the source image. Supported data types: U8/QS8/QS16/S16 + * @param[in] in1_ptr Pointer to the source image. Supported data types: U8/S16 * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) @@ -79,7 +56,7 @@ * @param[in] out_stride_z Stride of the destination image in Y dimension (in bytes) * @param[in] out_step_z out_stride_z * number of elements along Y processed per workitem(in bytes) * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] scale Integer scaling factor. Supported data types: S32 (ignored for QS8 and QS16 as the assumption is scale = 1). + * @param[in] scale Integer scaling factor. Supported data types: S32 */ __kernel void pixelwise_div_int( TENSOR3D_DECLARATION(in1), diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl index ddc9d5a27..ab1307e64 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl @@ -2,25 +2,17 @@ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved * Copyright (c) 2016, 2017 ARM Limited. * - * SPDX-License-Identifier: MIT + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: + * http://www.apache.org/licenses/LICENSE-2.0 * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
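As the comment block above describes, pixelwise_div_int divides the two inputs in a wider intermediate type (DATA_TYPE_RES), applies the right-shift scale, and converts back with or without saturation depending on -DSATURATE. A scalar sketch for S16 inputs on the saturating path (the concrete types are an assumption of this sketch):

#include <stdint.h>

/* (a / b) >> scale, computed in a wider type and saturated to S16.
 * b is assumed non-zero, as in the kernel. */
static int16_t pixelwise_div_int_ref(int16_t a, int16_t b, int scale)
{
    int32_t res = ((int32_t)a / b) >> scale;
    if (res > INT16_MAX) return INT16_MAX;
    if (res < INT16_MIN) return INT16_MIN;
    return (int16_t)res;
}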
*/ #include "helpers_asymm.h" diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl new file mode 100644 index 000000000..68da2ba32 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(DATA_TYPE) +/** Returns result of prelu function implemented as below: + * f(input) = alpha * input for input < 0, f(input) = input for input >= 0. + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Can only take floating point data types. + * + * @param[in] input1_ptr Pointer to the source image. Supported Data types : F16/F32 + * @param[in] input1_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input1_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source image + * + * @param[in] alpha_ptr Pointer to the source image. Supported Data types : F16/F32 + * @param[in] alpha_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] alpha_step_x input2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] alpha_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] alpha_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] alpha_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source image + * + * @param[out] output_ptr Pointer to the destination image. 
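The prelu kernel declared after this comment block applies the documented element-wise function, selecting between x and alpha*x per vector lane; a one-line scalar equivalent:

/* f(x) = alpha * x for x < 0, x otherwise: the per-lane select performed by the kernel below. */
static float prelu_ref(float x, float alpha) { return x < 0.0f ? alpha * x : x; }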
Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void prelu( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(alpha), + TENSOR3D_DECLARATION(output)) +{ + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VSTORE(VEC_SIZE) + (VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) < 0 ? + VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) * VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)alpha.ptr) : + VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), + 0, (__global DATA_TYPE *)output.ptr); + +} +#endif // defined(DATA_TYPE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl new file mode 100644 index 000000000..7e97b7ed6 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" +#define SUB(x, y) (x) - (y) + +#if defined(OFF_IN1) && defined(OFF_IN2) && defined(OFF_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(VEC_SIZE) + +#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE) +#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) +#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE) +#define CONVERT_RTE(x, type) (convert_##type##_rte((x))) +#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type) + +/** Returns result of prelu function implemented as below: + * f(input) = alpha * input for input < 0, f(input) = input for input >= 0. + * + * @attention Data type can be passed using the -DDATA_TYPE_IN compile flag, e.g. -DDATA_TYPE_IN=uchar + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Can only take uchar data types. + * + * @param[in] input1_ptr Pointer to the source image. 
Supported Data types : QASYMM8 + * @param[in] input1_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input1_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source image + * + * @param[in] alpha_ptr Pointer to the source image. Supported Data types : QASYMM8 + * @param[in] alpha_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] alpha_step_x input2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] alpha_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] alpha_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] alpha_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source image + * + * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void prelu_qasymm8( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(alpha), + TENSOR3D_DECLARATION(output)) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)input.ptr), VEC_INT); + VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)alpha.ptr), VEC_INT); + + in_a = SUB(in_a, (VEC_INT)((int)OFF_IN1)); + in_b = SUB(in_b, (VEC_INT)((int)OFF_IN2)); + + const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1); + const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2); + const VEC_FLOAT outf32 = in1f32 < 0 ? 
in1f32 * in2f32 : in1f32; + const VEC_FLOAT qresf32 = outf32 / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFF_OUT)); + const VEC_UCHAR res = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_UCHAR); + + VSTORE(VEC_SIZE) + (res, 0, (__global uchar *)output.ptr); +} + +#endif // defined(OFF_IN1) && defined(OFF_IN2) && defined(OFF_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(VEC_SIZE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl deleted file mode 100644 index dfa3b85f4..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#if defined(WIDTH) -/** Perform reduce max - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * - * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[out] output_ptr Pointer to the destination tensor. 
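The prelu_qasymm8 kernel above works in the dequantized domain: both QASYMM8 inputs are shifted by their offsets and scaled to float, PReLU is applied, and the result is requantized with round-to-nearest-even (convert_..._rte) and saturated back to uchar. A scalar reference sketch; the off/scale arguments correspond to the kernel's -DOFF_IN1, -DSCALE_IN1, and related build options:

#include <stdint.h>
#include <math.h>

static uint8_t prelu_qasymm8_ref(uint8_t x, uint8_t alpha,
                                 int off_in1, float scale_in1,
                                 int off_in2, float scale_in2,
                                 int off_out, float scale_out)
{
    float xf = (x - off_in1) * scale_in1;        /* dequantize input         */
    float af = (alpha - off_in2) * scale_in2;    /* dequantize alpha         */
    float yf = xf < 0.0f ? xf * af : xf;         /* PReLU                    */
    long  q  = lrintf(yf / scale_out) + off_out; /* requantize, nearest-even */
    if (q < 0)   q = 0;                          /* saturate to uchar range  */
    if (q > 255) q = 255;
    return (uint8_t)q;
}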
Supported data types: same as @p input_ptr - * @param[out] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[out] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[out] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void reduce_max(VECTOR_DECLARATION(input), - VECTOR_DECLARATION(output)) -{ - Vector input = CONVERT_TO_VECTOR_STRUCT(input); - Vector output = CONVERT_TO_VECTOR_STRUCT(output); - - __global float *input_addr = (__global float *)(input.ptr); - __global float *output_addr = (__global float *)(output.ptr); - - float max_value = *input_addr; - for(int x = 1; x < WIDTH; x++) - { - float value = *(input_addr + x); - max_value = max(value, max_value); - } - - // Store max - *output_addr = max_value; -} -#endif // defined(WIDTH) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl new file mode 100644 index 000000000..8bef49363 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016, 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) +/** Perform reduce max/min + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 + * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using + * -DOP_CODE = number. e.g. -DOP_CODE=1 + * + * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[out] output_ptr Pointer to the destination image. 
Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] axis Axis through which reduction occurs + * @param[in] dim Dimension across the axis to be reduced. + */ +__kernel void reduce_min_max(TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output), + const int axis, + const int dim) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + + int indices[4] = + { + get_global_id(0), + get_global_id(1), + get_global_id(2) % DEPTH_OUT, + get_global_id(2) / DEPTH_OUT, + }; + + DATA_TYPE value = *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); + for(int i = 1; i < dim; ++i) + { + indices[axis] = i; + + #if OP_CODE == 1 // REDUCE_MAX + value = max(value, *((__global DATA_TYPE *) + tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]))); + + #elif OP_CODE == 2 // REDUCE_MIN + value = min(value, *((__global DATA_TYPE *) + tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]))); + + #else // OP NOT SUPPORTED + return; + + #endif + } + + *((__global DATA_TYPE *)out.ptr) = value; +} + +/** Perform reduce sum/mean + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 + * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using + * -DOP_CODE = number. e.g. -DOP_CODE=1 + * + * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[out] output_ptr Pointer to the destination image. 
Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] axis Axis through which reduction occurs + * @param[in] dim Dimension across the axis to be reduced. + */ +__kernel void reduce_sum_mean(TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output), + const int axis, + const int dim) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + + int indices[4] = + { + get_global_id(0), + get_global_id(1), + get_global_id(2) % DEPTH_OUT, + get_global_id(2) / DEPTH_OUT, + }; + + DATA_TYPE sum_value = (DATA_TYPE)0; + for(int i = 0; i < dim; ++i) + { + indices[axis] = i; + sum_value += *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); + } + + #if OP_CODE == 3 // REDUCE_SUM + *((__global DATA_TYPE *)out.ptr) = sum_value; + + #elif OP_CODE == 4 // REDUCE_MEAN + *((__global DATA_TYPE *)out.ptr) = sum_value / CONVERT(dim, DATA_TYPE); + + #else // OP NOT SUPPORTED + return; + + #endif +} +#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduction_mean.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduction_mean.cl deleted file mode 100644 index 1a96eea61..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduction_mean.cl +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016, 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
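Both reduction kernels above walk the reduced axis with a plain loop per output element: reduce_min_max carries a running max or min (OP_CODE 1 or 2), while reduce_sum_mean accumulates a sum and, for OP_CODE 4, divides by the axis length. A host-side sketch for one output element of a dense NCHW float tensor (layout assumed by this sketch):

#include <stddef.h>

/* dims[4] = {W, H, C, N}; dense NCHW offset. */
static size_t off4d(const int dims[4], const int idx[4])
{
    return ((size_t)idx[3] * dims[2] + idx[2]) * dims[1] * dims[0]
         + (size_t)idx[1] * dims[0] + idx[0];
}

/* op: 1 = max, 2 = min, 3 = sum, 4 = mean (matching -DOP_CODE). */
float reduce_axis_ref(const float *in, const int dims[4], const int out_idx[4],
                      int axis, int op)
{
    int idx[4] = { out_idx[0], out_idx[1], out_idx[2], out_idx[3] };
    idx[axis] = 0;
    float acc = (op >= 3) ? 0.0f : in[off4d(dims, idx)];
    for (int i = (op >= 3) ? 0 : 1; i < dims[axis]; ++i) {
        idx[axis] = i;
        float v = in[off4d(dims, idx)];
        if (op == 1)      acc = acc > v ? acc : v;
        else if (op == 2) acc = acc < v ? acc : v;
        else              acc += v;
    }
    return (op == 4) ? acc / dims[axis] : acc;
}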
- */ -#include "helpers.h" - -inline DATA_TYPE sum_8(__global const DATA_TYPE *input) -{ - VEC_DATA_TYPE(DATA_TYPE, 8) - in = vload8(0, input); - in.s0123 += in.s4567; - in.s01 += in.s23; - return ((in.s0 + in.s1)); -} - -/** This function calculates the sum and sum of squares of a given input image. - * - * @note To enable calculation sum of squares -DSTDDEV should be passed as a preprocessor argument. - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] local_sum Local sum of all elements - * @param[in] height Height of the input image - * @param[in] divider Divider to calculate mean - */ -__kernel void reduction_mean( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst), - __local DATA_TYPE *local_sums, - int height, - int divider) -{ - // Get pixels pointer - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - float8 tmp_sum = 0; - // Calculate partial sum - - for(int i = 0; i < height; i++) - { - local_sums[0] += sum_8((__global DATA_TYPE *)offset(&src, 0, i)); - } - ((__global DATA_TYPE *)offset(&dst, get_global_id(0), get_global_id(1)))[0] = local_sums[0]/divider; -} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl new file mode 100644 index 000000000..a0fc2d5a9 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016, 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) +/** Perform space to batch with input of 4D and NCHW format + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 + * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size. e.g. -DBATCH_IN=16 + * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DHEIGHT_IN=16 + * @attention Input tensor width should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DWIDTH_IN=16 + * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. 
-DZERO_VALUE=0
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] block_size_ptr Pointer to the block size vector. Supported data types: S32
+ * @param[in] block_size_stride_x Stride of the block size vector in X dimension (in bytes)
+ * @param[in] block_size_step_x block_size_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] block_size_offset_first_element_in_bytes The offset of the first element in the block size vector
+ * @param[in] padding_size_ptr Pointer to the padding size tensor.
Supported data types: S32
+ * @param[in] padding_size_stride_x Stride of the padding size tensor in X dimension (in bytes)
+ * @param[in] padding_size_step_x padding_size_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] padding_size_stride_y Stride of the padding size tensor in Y dimension (in bytes)
+ * @param[in] padding_size_step_y padding_size_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] padding_size_offset_first_element_in_bytes The offset of the first element in the padding size tensor
+ */
+__kernel void space_to_batch_4d_nchw(TENSOR4D_DECLARATION(input),
+                                     TENSOR4D_DECLARATION(output),
+                                     VECTOR_DECLARATION(block_size),
+                                     IMAGE_DECLARATION(padding_size))
+{
+    Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+    int block_size_x = *((__global int *)(block_size_ptr));
+    int block_size_y = *((__global int *)(block_size_ptr + block_size_stride_x));
+    int shift_x = (get_global_id(2) / DEPTH_OUT / BATCH_IN) % block_size_x;
+    int shift_y = (get_global_id(2) / DEPTH_OUT / BATCH_IN) / block_size_x;
+
+    int in_index[4] = {0, };
+    in_index[0] = get_global_id(0) * block_size_x + shift_x - *((__global int *)(padding_size_ptr));
+    in_index[1] = get_global_id(1) * block_size_y + shift_y - *((__global int *)(padding_size_ptr + padding_size_stride_y));
+    in_index[2] = get_global_id(2) % DEPTH_OUT;
+    in_index[3] = (get_global_id(2) / DEPTH_OUT) % BATCH_IN;
+
+    if (in_index[0] < 0 || in_index[0] >= WIDTH_IN || in_index[1] < 0 || in_index[1] >= HEIGHT_IN)
+    {
+        *((__global DATA_TYPE *)out.ptr) = (DATA_TYPE)ZERO_VALUE;
+    }
+    else
+    {
+        *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3]));
+    }
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE)
+
+#if defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE)
+/** Perform space to batch with input of 4D and NHWC format
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor height should be given as a preprocessor argument using -DHEIGHT_OUT=size. e.g. -DHEIGHT_OUT=16
+ * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size. e.g. -DBATCH_IN=16
+ * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DHEIGHT_IN=16
+ * @attention Input tensor width should be given as a preprocessor argument using -DWIDTH_IN=size. e.g. -DWIDTH_IN=16
+ * @attention The value used for padding should be given as a preprocessor argument using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in] input_ptr Pointer to the source tensor.
Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] block_size_ptr Pointer to the block size vector. Supported data types: S32
+ * @param[in] block_size_stride_x Stride of the block size vector in X dimension (in bytes)
+ * @param[in] block_size_step_x block_size_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] block_size_offset_first_element_in_bytes The offset of the first element in the block size vector
+ * @param[in] padding_size_ptr Pointer to the padding size tensor.
Supported data types: S32 + * @param[in] padding_size_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] padding_size_step_x padding_size_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] padding_size_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] padding_size_step_y padding_size_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] padding_size_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void space_to_batch_4d_nhwc(TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output), + VECTOR_DECLARATION(block_size), + IMAGE_DECLARATION(padding_size)) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, HEIGHT_OUT); + + int block_size_x = *((__global int *)(block_size_ptr)); + int block_size_y = *((__global int *)(block_size_ptr + block_size_stride_x)); + int shift_x = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) % block_size_x; + int shift_y = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) / block_size_x; + + int in_index[4] = {0, }; + in_index[0] = get_global_id(0) * VEC_SIZE; + in_index[1] = get_global_id(1) * block_size_x + shift_x - *((__global int *)(padding_size_ptr)); + in_index[2] = get_global_id(2) % HEIGHT_OUT * block_size_y + shift_y - *((__global int *)(padding_size_ptr + padding_size_stride_y)); + in_index[3] = (get_global_id(2) / HEIGHT_OUT) % BATCH_IN; + + if (in_index[1] < 0 || in_index[1] >= WIDTH_IN || in_index[2] < 0 || in_index[2] >= HEIGHT_IN) + { + VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))ZERO_VALUE, 0, (__global DATA_TYPE *)out.ptr); + } + else + { + VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3])), + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), + 0, (__global DATA_TYPE *)out.ptr); + } +} + +#endif // defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl new file mode 100644 index 000000000..f6977045a --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016, 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) +/** Perform space to depth rearrangement of tensor + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16 + * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. 
e.g. -DBLOCK_SIZE=1
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void space_to_depth(
+    TENSOR4D_DECLARATION(input),
+    TENSOR4D_DECLARATION(output))
+    {
+    Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+
+    int out_index[4]={0};
+    int in_index[4]={0};
+
+    in_index[0] = get_global_id(0);//W
+    in_index[1] = get_global_id(1);//H
+    in_index[2] = get_global_id(2) % DEPTH_IN;//C
+    in_index[3] = get_global_id(2) / DEPTH_IN;//B
+
+    out_index[0] = in_index[0]/BLOCK_SIZE;
+    out_index[1] = in_index[1]/BLOCK_SIZE;
+    out_index[2] = in_index[2] + ((in_index[1] % BLOCK_SIZE) * BLOCK_SIZE + in_index[0] % BLOCK_SIZE) * DEPTH_IN;
+    out_index[3] = in_index[3];
+
+    *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0],out_index[1],out_index[2],out_index[3])) = *((__global DATA_TYPE *)in.ptr);
+    }
+#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl
new file mode 100644
index 000000000..3e1a5c97f
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(DATA_TYPE) +/** Returns true value of squared_difference of two tensors. + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Can only take floating point data types. + * + * @param[in] input1_ptr Pointer to the source image. Supported data types: F16/F32 + * @param[in] input1_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input1_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source image + * + * @param[in] input2_ptr Pointer to the source image. Supported data types: F16/F32 + * @param[in] input2_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input2_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input2_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input2_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input2_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source image + * + * @param[out] output_ptr Pointer to the destination image. 
Supported data types: F16/F32 + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void squared_difference( + TENSOR3D_DECLARATION(input1), + TENSOR3D_DECLARATION(input2), + TENSOR3D_DECLARATION(output)) +{ + Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1); + Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + diff = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr)- VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr); + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + sq_diff = diff * diff; + + VSTORE(VEC_SIZE) + (sq_diff, 0, (__global DATA_TYPE *)output.ptr); +} +#endif // defined(DATA_TYPE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice.cl deleted file mode 100644 index c5ff82f9e..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice.cl +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - - -inline Tensor4D tensor4D_from_vector_no_step(const Vector *vector, int dim_x, int dim_y, int dim_z, int dim_w) -{ - int stride_x = vector->stride_x; - int stride_y = stride_x * dim_x; - int stride_z = stride_y * dim_y; - int stride_w = stride_z * dim_z; - Tensor4D tensor = - { - .ptr = vector->ptr, - .offset_first_element_in_bytes = vector->offset_first_element_in_bytes, - .stride_x = stride_x, - .stride_y = stride_y, - .stride_z = stride_z, - .stride_w = stride_w, - }; - return tensor; -} - -/** Extracts a strided slice up to 4-dimensions - * - * @note Datatype should be given as a preprocessor argument using -DELEMENT_DATA_TYPE=type. 
e.g. -DELEMENT_DATA_TYPE=short - * @note The size of an element should be given as a preprocessor argument using -DELEMENT_SIZE=size. e.g. -DELEMENT_SIZE=2 - * - * @param[in] input_ptr Pointer to the first source tensor. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32 - * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] dims_in The 4-dimensional dimension of the input. Supported data types: S32 - * @param[in] dims_out The 4-dimensional dimension of the output. Supported data types: S32 - * @param[in] starts The stride of X dimension of input tensor to be sliced. Supported data types: S32 - * @param[in] strides The stride of Y dimension of input tensor to be sliced. Supported data types: S32 - */ -__kernel void strided_slice(VECTOR_DECLARATION(input), - VECTOR_DECLARATION(output), - const int4 dims_in, - const int4 dims_out, - const int4 starts, - const int4 strides) -{ - // TODO: Should be change to CONVERT_TO_TENSOR4D_STRUCT in order to reduce inference of the offset - Vector vec_out = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output); - Vector vec_in = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input); - - // Implemenation - // Infer a Tensor4D from output Vector and output's dimensions info - // Infer a Tensor4D from input Vector and input's dimensions info - // Infer indices of output as 4D from the offset of output vector - // Infer indices of input as 4D from indices of output - // out(offset of output vector) = in(offset of input) - - Tensor4D tensor_out = tensor4D_from_vector_no_step(&vec_out, dims_out.x, dims_out.y, dims_out.z, dims_out.w); - Tensor4D tensor_in = tensor4D_from_vector_no_step(&vec_in, dims_in.x, dims_in.y, dims_in.z, dims_in.w); - - // Must be output_step_x == output_stride_x == an element's size - const int offset_out = get_global_id(0) * output_stride_x; - int4 indices_out = - { - get_global_id(0) % dims_out.x, - (offset_out / tensor_out.stride_y) % dims_out.y, - (offset_out / tensor_out.stride_z) % dims_out.z, - (offset_out / tensor_out.stride_w) % dims_out.w, - }; - - int4 indices_in = - { - starts.x + (strides.x * indices_out.x), - starts.y + (strides.y * indices_out.y), - starts.z + (strides.z * indices_out.z), - starts.w + (strides.w * indices_out.w), - }; - - *((__global ELEMENT_DATA_TYPE *)vector_offset(&vec_out, get_global_id(0))) = *((__global ELEMENT_DATA_TYPE *)tensor4D_offset(&tensor_in, indices_in.x, indices_in.y, indices_in.z, indices_in.w)); -} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice_ex.cl new file mode 100644 index 000000000..b39c55b96 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice_ex.cl @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. 
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(ELEMENT_DATA_TYPE) && defined(DEPTH_OUT)
+/** Extracts a strided slice of up to 4 dimensions
+ *
+ * @note Datatype should be given as a preprocessor argument using -DELEMENT_DATA_TYPE=type. e.g. -DELEMENT_DATA_TYPE=short
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] starts The start index of each dimension of the input tensor to be sliced. Supported data types: S32
+ * @param[in] strides The stride of each dimension of the input tensor to be sliced.
Supported data types: S32 + */ +__kernel void strided_slice_ex(TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output), + const int4 starts, + const int4 strides) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + + int4 indices_in = + { + starts.x + (strides.x * get_global_id(0)), + starts.y + (strides.y * get_global_id(1)), + starts.z + (strides.z * (get_global_id(2) % DEPTH_OUT)), + starts.w + (strides.w * (get_global_id(2) / DEPTH_OUT)), + }; + *((__global ELEMENT_DATA_TYPE *)out.ptr) = *((__global ELEMENT_DATA_TYPE *)tensor4D_offset(&in, indices_in.x, indices_in.y, indices_in.z, indices_in.w)); +} +#endif // defined(ELEMENT_DATA_TYPE) && defined(DEPTH_OUT) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl index 0b0cf8218..d97f23a47 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl @@ -2,25 +2,17 @@ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved * Copyright (c) 2017 ARM Limited. * - * SPDX-License-Identifier: MIT + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: + * http://www.apache.org/licenses/LICENSE-2.0 * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ #include "helpers.h" diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl index deadf8412..0292fab04 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl @@ -2,25 +2,17 @@ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved * Copyright (c) 2017 ARM Limited. * - * SPDX-License-Identifier: MIT + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: + * http://www.apache.org/licenses/LICENSE-2.0 * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ #include "helpers.h" diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl index cac0c071e..c2c2d89a4 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl @@ -2,25 +2,17 @@ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved * Copyright (c) 2017 ARM Limited. * - * SPDX-License-Identifier: MIT + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: + * http://www.apache.org/licenses/LICENSE-2.0 * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. */ // reference: diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp new file mode 100644 index 000000000..1fdd2f98f --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLActivationLayerExKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/UtilsEx.h" + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const ActivationLayerInfoEx &act_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, + DataType::F16, DataType::F32); + + // Checks performed when output is configured + if ((output != nullptr) && (output->total_size() != 0)) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + } + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + if (output != nullptr) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*output, *input); + } + + const unsigned int num_elems_processed_per_iteration = 16 / input->element_size(); + + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); + bool window_changed = false; + + if (output != nullptr) + { + AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + window_changed = update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, input->valid_region()); + } + else + { + window_changed = update_window_and_padding( + win, AccessWindowHorizontal(input, 0, num_elems_processed_per_iteration)); + } + + Status err = (window_changed) + ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + return std::make_pair(err, win); +} +} // namespace + +CLActivationLayerExKernel::CLActivationLayerExKernel() + : _input(nullptr), _output(nullptr), _run_in_place(false) +{ +} + +void CLActivationLayerExKernel::configure(ICLTensor *input, ICLTensor *output, + ActivationLayerInfoEx act_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + _run_in_place = (output == nullptr) || (output == input); + + if (output != nullptr) + { + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*output->info(), *input->info()->clone()); + } + + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, act_info)); + + const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size(); + const DataType dt = input->info()->data_type(); + float a_const = act_info.a(); + float b_const = act_info.b(); + int a_const_int = 0; + int b_const_int = 0; + + // Create quantized version of constants a, b if needed + if (is_data_type_quantized(dt)) + { + a_const_int = + input->info()->quantization_info().quantize(a_const, RoundingPolicy::TO_NEAREST_UP); + b_const_int = + input->info()->quantization_info().quantize(b_const, RoundingPolicy::TO_NEAREST_UP); + } + + // Set build options + std::set<std::string> build_opts; + build_opts.emplace( + ("-DACT=" + lower_string(string_from_activation_func_ex(act_info.activation())))); + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt))); + build_opts.emplace( + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + + if (is_data_type_quantized(dt)) + { + build_opts.emplace(("-DA_VAL=" + support::cpp11::to_string(a_const_int))); + build_opts.emplace(("-DB_VAL=" + support::cpp11::to_string(b_const_int))); + + const int o1 = input->info()->quantization_info().offset; + // Quantized value of 0 corresponds to the offset o1 + build_opts.emplace(("-DCONST_0=" + support::cpp11::to_string(o1))); + + // Set scale and offset of the input and output if they have different quantization info + if (is_data_type_quantized_asymmetric(dt) && output != nullptr) + { + const float s1 = input->info()->quantization_info().scale; + const float s2 = output->info()->quantization_info().scale; + const int o2 = output->info()->quantization_info().offset; + + if (o1 != o2 || s1 != s2) + { + build_opts.emplace(("-DS1_VAL=" + float_to_string_with_full_precision(s1))); + build_opts.emplace(("-DS2_VAL=" + float_to_string_with_full_precision(s2))); + build_opts.emplace(("-DO1_VAL=" + support::cpp11::to_string(o1))); + build_opts.emplace(("-DO2_VAL=" + support::cpp11::to_string(o2))); + } + } + } + else + { + build_opts.emplace(("-DA_VAL=" + float_to_string_with_full_precision(a_const))); + build_opts.emplace(("-DB_VAL=" + float_to_string_with_full_precision(b_const))); + } + + build_opts.emplace((_run_in_place) ? "-DIN_PLACE" : ""); + + // Create kernel + std::string kernel_name = std::string("activation_layer_ex"); + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + + // Make sure _kernel is initialized before calling the parent's configure + _input = input; + _output = output; + + // Configure kernel window + auto win_config = + validate_and_configure_window(input->info(), (_run_in_place) ? 
nullptr : output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); + + // Set config_id for enabling LWS tuning + _config_id = "activation_layer_ex_"; + _config_id += lower_string(string_from_data_type(dt)); + _config_id += "_"; + _config_id += support::cpp11::to_string(input->info()->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(input->info()->dimension(1)); +} + +Status CLActivationLayerExKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + const ActivationLayerInfoEx &act_info) +{ + const bool run_in_place = (output == nullptr) || (output == input); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), + (run_in_place) ? nullptr : output->clone().get()) + .first); + + return Status{}; +} + +void CLActivationLayerExKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice); + if (!_run_in_place) + { + add_3D_tensor_argument(idx, _output, slice); + } + enqueue(queue, *this, slice, lws_hint()); + } while (collapsed.slide_window_slice_3D(slice)); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp new file mode 100644 index 000000000..c1a2ad0be --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLArgMinMaxKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t argminmax_axis) +{ + TensorShape out_shape{input_shape}; + + out_shape.set(argminmax_axis, 1); + + return out_shape; +} +} // namespace + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const uint32_t argminmax_axis, ArgOperation op) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32, DataType::F32, + DataType::U8); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(input, output); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0, + "Inputs are not broadcast compatible"); + + const TensorShape output_shape = inferOutputShape(input->tensor_shape(), argminmax_axis); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(), + "output shape's size does not match argminmax_axis"); + + const auto num_dimensions = input->tensor_shape().num_dimensions(); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + argminmax_axis >= 0 && argminmax_axis < num_dimensions, + "argminmax_axis must be greater than or equal to 0 and less than (input's rank)."); + return Status{}; +} + +} // namespace + +CLArgMinMaxKernel::CLArgMinMaxKernel() : _input(nullptr), _output(nullptr), _argminmax_axis() {} + +void CLArgMinMaxKernel::configure(const ICLTensor *input, ICLTensor *output, + const uint32_t argminmax_axis, ArgOperation op) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), argminmax_axis)); + + _input = input; + _output = output; + _argminmax_axis = argminmax_axis; + + std::unique_ptr<ITensorInfo> output_info = output->info()->clone(); + output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), argminmax_axis)); + + // Construct kernel name for argmax and argmin based on axis + std::string kernel_name = "arg_op"; + int op_code = 0; + if (op == ArgOperation::MAX) + { + op_code = 1; + } + else if (op == ArgOperation::MIN) + { + op_code = 2; + } + else + throw std::runtime_error("Operation not supported, yet"); + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(output_info->data_type())); + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2))); + build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code)); + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + + // Configure kernel window + Window win = calculate_max_window(*output_info, Steps()); + + Coordinates coord; + coord.set_num_dimensions(output_info->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape())); + + ICLKernel::configure_internal(win); +} + +Status CLArgMinMaxKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + const uint32_t argminmax_axis, ArgOperation op) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, argminmax_axis, op)); + + return 
Status{}; +} + +void CLArgMinMaxKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const TensorShape &shape_in = _input->info()->tensor_shape(); + + unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters + + _kernel.setArg<cl_int>(idx++, _argminmax_axis); + _kernel.setArg<cl_int>(idx++, shape_in[_argminmax_axis]); + + Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + // Setup input slice + Window slice_in(slice_out); + slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_in.set(3, Window::Dimension(0, 0, 0)); + + // Copy output's shape in order to use for recovering at end of this method + const TensorShape shape_out = _output->info()->tensor_shape(); + _output->info()->set_tensor_shape(inferOutputShape(shape_in, _argminmax_axis)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_out); + } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); + + // Recover output's shape of output tensor + _output->info()->set_tensor_shape(shape_out); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp new file mode 100644 index 000000000..1c505b4d5 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionExKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output, ConvertPolicy policy) +{ + ARM_COMPUTE_UNUSED(policy); + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, + DataType::F16, DataType::F32); + + const TensorShape &out_shape = + TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, + "Inputs are not broadcast compatible"); + + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + output->data_type() == DataType::U8 && + (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8), + "Output can only be U8 if both inputs are U8"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), + "Wrong shape for output"); + } + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, + ITensorInfo *output) +{ + const std::pair<TensorShape, ValidRegion> broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2); + const TensorShape &out_shape = broadcast_pair.first; + const ValidRegion &valid_region = broadcast_pair.second; + + // Auto initialize output if not initialized + { + set_shape_if_empty(*output, out_shape); + + if (input1->data_type() == DataType::S16 || input2->data_type() == DataType::S16) + { + set_format_if_unknown(*output, Format::S16); + } + else if (input1->data_type() == DataType::F16 && input2->data_type() == DataType::F16) + { + set_format_if_unknown(*output, Format::F16); + } + else if (input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32) + { + set_format_if_unknown(*output, Format::F32); + } + } + + Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); + Window win_input1 = win.broadcast_if_dimension_le_one(*input1); + Window win_input2 = win.broadcast_if_dimension_le_one(*input2); + + AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win_input1, input1_access) || + update_window_and_padding(win_input2, input2_access) || + update_window_and_padding(win, output_access); + + output_access.set_valid_region(win, valid_region); + + Status err = (window_changed) + ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + return std::make_pair(err, win); +} +} // namespace + +CLArithmeticSubtractionExKernel::CLArithmeticSubtractionExKernel() + : _input1(nullptr), _input2(nullptr), _output(nullptr) +{ +} + +void CLArithmeticSubtractionExKernel::configure(const ICLTensor *input1, const ICLTensor *input2, + ICLTensor *output, ConvertPolicy policy) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input1->info(), input2->info(), output->info(), policy)); + + // Configure kernel window + auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + + _input1 = input1; + _input2 = input2; + _output = output; + + const bool has_float_out = is_data_type_float(output->info()->data_type()); + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace((policy == ConvertPolicy::WRAP || has_float_out) ? "-DWRAP" : "-DSATURATE"); + build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type())); + build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type())); + build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("arithmetic_sub_ex", build_opts)); + + ICLKernel::configure_internal(win_config.second); +} + +Status CLArithmeticSubtractionExKernel::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, ConvertPolicy policy) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, policy)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), + input2->clone().get(), + output->clone().get()) + .first); + + return Status{}; +} + +void CLArithmeticSubtractionExKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const TensorShape &in_shape1 = _input1->info()->tensor_shape(); + const TensorShape &in_shape2 = _input2->info()->tensor_shape(); + const TensorShape &out_shape = _output->info()->tensor_shape(); + + bool can_collapse = true; + if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) + { + can_collapse = + (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); + for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) + { + can_collapse = (in_shape1[d] == in_shape2[d]); + } + } + + bool has_collapsed = false; + Window collapsed = + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) + : window; + + const TensorShape &in_shape1_collapsed = + has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; + const TensorShape &in_shape2_collapsed = + has_collapsed ? 
in_shape2.collapsed_from(Window::DimZ) : in_shape2; + + Window slice = collapsed.first_slice_window_3D(); + Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); + Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); + + do + { + unsigned int idx = 0; + + add_3D_tensor_argument(idx, _input1, slice_input1); + add_3D_tensor_argument(idx, _input2, slice_input2); + add_3D_tensor_argument(idx, _output, slice); + + enqueue(queue, *this, slice); + + collapsed.slide_window_slice_3D(slice_input1); + collapsed.slide_window_slice_3D(slice_input2); + } while (collapsed.slide_window_slice_3D(slice)); +} + +BorderSize CLArithmeticSubtractionExKernel::border_size() const +{ + const unsigned int replicateSize = + _output->info()->dimension(0) - + std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); + const unsigned int border = + std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); + return BorderSize(0, border, 0, 0); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp new file mode 100644 index 000000000..b0016d23c --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const int32_t *block_size) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size[0] >= 1 && block_size[1] >= 1, + "Block size should be greater than or equal to 1."); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) == output->dimension(2), + "Input Depth should be equal to Output Depth"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + output->dimension(3) * block_size[0] * block_size[1] == input->dimension(3), + "Input batch should be equal to (output batch * block size[0] *block size[1])"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(output->dimension(0) % block_size[1]) && + !(output->dimension(1) % block_size[0]), + "Output height and width should be divisible by block size[0] " + "and block_size[1] respectively"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) == input->dimension(0) * block_size[1]) && + (output->dimension(1) == input->dimension(1) * block_size[0]), + "Output height and width should be equal to " + "input_height*blocksize[0] and input_width*blocksize[1] " + "respectively"); + + return Status{}; +} + +} // namespace + +CLBatchToSpaceNDKernel::CLBatchToSpaceNDKernel() : _input(nullptr), _output(nullptr) {} + +void CLBatchToSpaceNDKernel::configure(const ICLTensor *input, ICLTensor *output, + const int32_t *block_size) +{ + + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size)); + + _input = input; + _output = output; + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DBLOCK_SIZE0=" + support::cpp11::to_string(block_size[0])); + build_opts.emplace("-DBLOCK_SIZE1=" + support::cpp11::to_string(block_size[1])); + build_opts.emplace("-DBATCH_OUT=" + support::cpp11::to_string(output->info()->dimension(3))); + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("batch_to_space_nd", build_opts)); + + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps()); + + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + + ICLKernel::configure_internal(win); +} + +void CLBatchToSpaceNDKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + + Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + // Setup output slice + Window slice_out(slice_in); + slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + 
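// The remaining dimensions are nulled the same way: a (0, 0, 0) dimension applies no offset for the corresponding tensor argument, and the do/while loop below advances both slices together with slide_window_slice_4D. +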
slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_out.set(3, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_out); + add_4D_tensor_argument(idx, _output, slice_in); + enqueue(queue, *this, slice_in); + } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in)); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp new file mode 100644 index 000000000..3d2f2c702 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +Status validate_parameters(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output) +{ + const TensorShape &out_shape = + TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, + "Inputs are not broadcast compatible"); + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, + DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), + "Wrong shape for output"); + } + return Status{}; +} +} // namespace + +CLBinaryLogicalOpKernel::CLBinaryLogicalOpKernel() + : _input1(nullptr), _input2(nullptr), _output(nullptr) +{ +} + +void CLBinaryLogicalOpKernel::configure(const ICLTensor *input1, const ICLTensor *input2, + ICLTensor *output, BinaryLogicalOperation op) +{ + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_parameters(input1->info(), input2->info(), output->info())); + + _input1 = input1; + _input2 = input2; + _output = output; + + // Create kernel + std::string kernel_name = "binary_logical_op"; + std::set<std::string> build_opts; + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type()))); + + int op_code = 0; + switch (op) + { + case BinaryLogicalOperation::AND: + op_code = 1; + break; + case BinaryLogicalOperation::OR: + op_code = 2; + break; + default: + throw std::runtime_error("Operation not supported, 
yet"); + } + + build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code))); + build_opts.emplace( + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + + const std::pair<TensorShape, ValidRegion> broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info()); + + const TensorShape &out_shape = broadcast_pair.first; + const ValidRegion &valid_region = broadcast_pair.second; + + Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); + Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info()); + Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info()); + + AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win_input1, input1_access) || + update_window_and_padding(win_input2, input2_access) || + update_window_and_padding(win, output_access); + + output_access.set_valid_region(win, valid_region); + + ICLKernel::configure_internal(win); +} + +void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const TensorShape &in_shape1 = _input1->info()->tensor_shape(); + const TensorShape &in_shape2 = _input2->info()->tensor_shape(); + const TensorShape &out_shape = _output->info()->tensor_shape(); + + bool can_collapse = true; + if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) + { + can_collapse = + (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); + for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) + { + can_collapse = (in_shape1[d] == in_shape2[d]); + } + } + + bool has_collapsed = false; + Window collapsed = + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) + : window; + + const TensorShape &in_shape1_collapsed = + has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; + const TensorShape &in_shape2_collapsed = + has_collapsed ? 
in_shape2.collapsed_from(Window::DimZ) : in_shape2; + + Window slice = collapsed.first_slice_window_3D(); + Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); + Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input1, slice_input1); + add_3D_tensor_argument(idx, _input2, slice_input2); + add_3D_tensor_argument(idx, _output, slice); + + enqueue(queue, *this, slice); + + collapsed.slide_window_slice_3D(slice_input1); + collapsed.slide_window_slice_3D(slice_input2); + } while (collapsed.slide_window_slice_3D(slice)); +} + +BorderSize CLBinaryLogicalOpKernel::border_size() const +{ + const unsigned int replicateSize = + _output->info()->dimension(0) - + std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); + const unsigned int border = + std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); + return BorderSize(0, border, 0, 0); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp index b019e8c33..bf7ebae3f 100644 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp @@ -17,15 +17,8 @@ #include "arm_compute/core/CL/kernels/CLCastKernel.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IAccessWindow.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" using namespace arm_compute; @@ -60,8 +53,8 @@ void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output) { const float scale_in = input->info()->quantization_info().scale; const int offset_in = input->info()->quantization_info().offset; - build_opts.emplace("-DSCALE_IN=" + float_to_string_with_full_precision(scale_in)); - build_opts.emplace("-DOFFSET_IN=" + support::cpp11::to_string(offset_in)); + build_opts.emplace("-DSCALE=" + float_to_string_with_full_precision(scale_in)); + build_opts.emplace("-DOFFSET=" + support::cpp11::to_string(offset_in)); _kernel = static_cast<cl::Kernel>( CLKernelLibraryEx::get().create_kernel("cast_qasymm_in", build_opts)); @@ -70,8 +63,8 @@ void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output) { const float scale_in = output->info()->quantization_info().scale; const int offset_in = output->info()->quantization_info().offset; - build_opts.emplace("-DSCALE_IN=" + float_to_string_with_full_precision(scale_in)); - build_opts.emplace("-DOFFSET_IN=" + support::cpp11::to_string(offset_in)); + build_opts.emplace("-DSCALE=" + float_to_string_with_full_precision(scale_in)); + build_opts.emplace("-DOFFSET=" + support::cpp11::to_string(offset_in)); _kernel = static_cast<cl::Kernel>( CLKernelLibraryEx::get().create_kernel("cast_qasymm_out", build_opts)); @@ -88,7 +81,7 @@ void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output) update_window_and_padding(win, input_access, output_access); output_access.set_valid_region(win, input->info()->valid_region()); - ICLKernel::configure(win); + ICLKernel::configure_internal(win); } void CLCastKernel::run(const Window &window, cl::CommandQueue &queue) diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp 
b/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp new file mode 100644 index 000000000..5af5b16ea --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLComparisonOpKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output) +{ + const TensorShape &out_shape = + TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::U16, + DataType::S16, DataType::F16, DataType::S32, + DataType::F32, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::U16, + DataType::S16, DataType::F16, DataType::S32, + DataType::F32, DataType::QASYMM8); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, + "Inputs are not broadcast compatible"); + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), + "Wrong shape for output"); + } + return Status{}; +} +} // namespace + +CLComparisonOpKernel::CLComparisonOpKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr) +{ +} + +void CLComparisonOpKernel::configure(const ICLTensor *input1, const ICLTensor *input2, + ICLTensor *output, const ComparisonOperation &op) +{ + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info())); + + _input1 = input1; + _input2 = input2; + _output = output; + + // Create kernel + std::string kernel_name = "comparison_op"; + int op_code = 0; + + switch (op) + { + case ComparisonOperation::EQUAL: + op_code = 1; + break; + case ComparisonOperation::NOT_EQUAL: + op_code = 2; + break; + default: + throw std::runtime_error(" Operation not supported, yet"); + } + + std::set<std::string> build_opts; + build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code))); + build_opts.emplace(("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input1->info()->data_type()))); + build_opts.emplace( + ("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()))); + build_opts.emplace( + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + + if (is_data_type_quantized_asymmetric(input1->info()->data_type()) && + 
((input1->info()->quantization_info().offset != input2->info()->quantization_info().offset) || + (input1->info()->quantization_info().scale != input2->info()->quantization_info().scale))) + { + build_opts.emplace("-DOFFSET_IN1=" + + support::cpp11::to_string(input1->info()->quantization_info().offset)); + build_opts.emplace("-DOFFSET_IN2=" + + support::cpp11::to_string(input2->info()->quantization_info().offset)); + build_opts.emplace("-DSCALE_IN1=" + + support::cpp11::to_string(input1->info()->quantization_info().scale)); + build_opts.emplace("-DSCALE_IN2=" + + support::cpp11::to_string(input2->info()->quantization_info().scale)); + kernel_name += "_qasymm8"; + } + + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + + const std::pair<TensorShape, ValidRegion> broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info()); + + const TensorShape &out_shape = broadcast_pair.first; + const ValidRegion &valid_region = broadcast_pair.second; + + // Auto initialize output if not initialized + { + set_shape_if_empty(*output->info(), out_shape); + + if (input1->info()->data_type() == DataType::S16 || + input2->info()->data_type() == DataType::S16) + { + set_format_if_unknown(*output->info(), Format::S16); + } + else if (input1->info()->data_type() == DataType::F16 && + input2->info()->data_type() == DataType::F16) + { + set_format_if_unknown(*output->info(), Format::F16); + } + else if (input1->info()->data_type() == DataType::F32 || + input2->info()->data_type() == DataType::F32) + { + set_format_if_unknown(*output->info(), Format::F32); + } + } + + Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); + Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info()); + Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info()); + + AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win_input1, input1_access) || + update_window_and_padding(win_input2, input2_access) || + update_window_and_padding(win, output_access); + + output_access.set_valid_region(win, valid_region); + + ICLKernel::configure_internal(win); +} + +void CLComparisonOpKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const TensorShape &in_shape1 = _input1->info()->tensor_shape(); + const TensorShape &in_shape2 = _input2->info()->tensor_shape(); + const TensorShape &out_shape = _output->info()->tensor_shape(); + + bool can_collapse = true; + if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) + { + can_collapse = + (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); + for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) + { + can_collapse = (in_shape1[d] == in_shape2[d]); + } + } + + bool has_collapsed = false; + Window collapsed = + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) + : window; + + const TensorShape &in_shape1_collapsed = + has_collapsed ? 
in_shape1.collapsed_from(Window::DimZ) : in_shape1; + const TensorShape &in_shape2_collapsed = + has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; + + Window slice = collapsed.first_slice_window_3D(); + Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); + Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input1, slice_input1); + add_3D_tensor_argument(idx, _input2, slice_input2); + add_3D_tensor_argument(idx, _output, slice); + + enqueue(queue, *this, slice); + + collapsed.slide_window_slice_3D(slice_input1); + collapsed.slide_window_slice_3D(slice_input2); + } while (collapsed.slide_window_slice_3D(slice)); +} + +BorderSize CLComparisonOpKernel::border_size() const +{ + const unsigned int replicateSize = + _output->info()->dimension(0) - + std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); + const unsigned int border = + std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); + return BorderSize(0, border, 0, 0); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp new file mode 100644 index 000000000..c386e3312 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const int32_t block_size) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size >= 1, + "Block size should be greater than or equal to 1."); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) == input->dimension(0) * block_size, + "Output width should be equal to (Input width * block size)"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(1) == input->dimension(1) * block_size, + "Output height should be equal to (Input height * block size)"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) % (block_size * block_size) == 0, + "Input depth should be divisible by (block size * block size)"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + output->dimension(2) == input->dimension(2) / (block_size * block_size), + "Output depth should be equal to (Input depth / (block size * block size))"); + + return Status{}; +} +} // namespace + +CLDepthToSpaceKernel::CLDepthToSpaceKernel() : _input(nullptr), _output(nullptr) +{ + // DO NOTHING +} + +void CLDepthToSpaceKernel::configure(const ICLTensor *input, ICLTensor *output, + const int32_t block_size) +{ + + _input = input; + _output = output; + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size)); + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("depth_to_space", build_opts)); + + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps()); + + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + + ICLKernel::configure_internal(win); +} + +void CLDepthToSpaceKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + + Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + // Setup input slice + Window slice_in(slice_out); + slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_in.set(3, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_out); + } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp new file mode 100644 index 
000000000..0862b78bf --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, input_access, output_access); + input_access.set_valid_region(win, output->valid_region()); + + Status err = (window_changed) + ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + return std::make_pair(err, win); +} +} // namespace + +CLEmbeddingLookupKernel::CLEmbeddingLookupKernel() + : _input(nullptr), _output(nullptr), _lookups(nullptr) +{ +} + +Status CLEmbeddingLookupKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *lookups) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1); + + return Status{}; +} + +void CLEmbeddingLookupKernel::configure(const ICLTensor *input, ICLTensor *output, + const ICLTensor *lookups) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info())); + + _input = input; + _output = output; + _lookups = lookups; + + // Set kernel build options + std::stringstream kernel_name; + std::set<std::string> build_opts; + kernel_name << "embedding_lookup"; + + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions())); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel(kernel_name.str(), 
build_opts)); + + // Configure kernel window + auto win_config = validate_and_configure_window(input->info(), output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); +} + +void CLEmbeddingLookupKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + Window win_lookup; + win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_in); + add_1D_tensor_argument(idx, _lookups, win_lookup); + + enqueue(queue, *this, slice_in); + } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_1D(win_lookup)); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp new file mode 100644 index 000000000..b1ee21bdc --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLExpKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +CLExpKernel::CLExpKernel() : _input(nullptr), _output(nullptr) {} + +void CLExpKernel::configure(const ICLTensor *input, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + // Auto initialize output + auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), + input->info()->quantization_info()); + + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + _input = input; + _output = output; + + constexpr unsigned int num_elems_processed_per_iteration = 4; + + // Create kernel + std::set<std::string> build_opts; + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.emplace( + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("exp_layer", build_opts)); + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, input->info()->valid_region()); + + ICLKernel::configure_internal(win); +} + +void CLExpKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice); + add_3D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } while (collapsed.slide_window_slice_3D(slice)); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp index 23efafa6a..ae2801e2b 100644 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp @@ -17,26 +17,14 @@ #include "arm_compute/core/CL/kernels/CLGatherKernel.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -#include <cmath> -#include <cstdlib> -#include <set> -#include <string> using namespace arm_compute; namespace { -constexpr unsigned int num_elems_processed_per_iteration = 16; +constexpr unsigned int num_elems_processed_per_iteration = 1; Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) @@ -46,6 +34,7 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S32, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); return Status{}; } @@ -57,8 +46,7 @@ CLGatherKernel::CLGatherKernel() : _input1(nullptr), _input2(nullptr), _output(n void CLGatherKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info())); _input1 = input1; _input2 = input2; @@ -89,11 +77,10 @@ void CLGatherKernel::configure(const ICLTensor *input1, const ICLTensor *input2, static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); // Configure kernel window - const unsigned int num_elems_processed_per_iteration = 1; Window win = calculate_max_window(*input2->info(), Steps(num_elems_processed_per_iteration)); output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); - ICLKernel::configure(win); + ICLKernel::configure_internal(win); } Status CLGatherKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp new file mode 100644 index 000000000..cd7b21c6d --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, input_access, output_access); + input_access.set_valid_region(win, output->valid_region()); + + Status err = (window_changed) + ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + return std::make_pair(err, win); +} +} // namespace + +CLHashtableLookupKernel::CLHashtableLookupKernel() + : _input(nullptr), _output(nullptr), _lookups(nullptr) +{ +} + +Status CLHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITensorInfo *keys, + const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *hits) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0, + "Output's shape was not set"); + + ARM_COMPUTE_ERROR_ON(lookups->dimensions(0) == hits->dimensions(0) && + output->dimension(output->num_dimensions() - 1) == lookups->dimension(0)); + ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1); + ARM_COMPUTE_ERROR_ON(keys->num_dimensions() > 1); + ARM_COMPUTE_ERROR_ON(hits->num_dimensions() > 1); + + return Status{}; +} + +void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTensor *keys, + const ICLTensor *input, ICLTensor *output, ICLTensor *hits) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info())); + + _lookups = lookups; + _keys = keys; + _input = input; + _output = output; + _hits = hits; + + // Make _lookup_indices tensor + _lookup_indices = arm_compute::support::cpp14::make_unique<CLTensor>(); + _lookup_indices->allocator()->init( + TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32)); + _lookup_indices->allocator()->allocate(); + + // Set kernel build options + std::stringstream kernel_name; + std::set<std::string> build_opts; + kernel_name << "hashtable_lookup"; + + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions())); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts)); + + // Configure kernel window + auto win_config = validate_and_configure_window(input->info(), output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); +} + +void CLHashtableLookupKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const_cast<ICLTensor *>(_lookups)->map(queue); + const_cast<ICLTensor *>(_keys)->map(queue); + _hits->map(queue); + _lookup_indices->map(queue); + + // Set values of hits + const int32_t *lookups_buf = + reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_lookups)->buffer()); + const int32_t 
*keys_buf = reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_keys)->buffer()); + uint8_t *hits_buf = reinterpret_cast<uint8_t *>(_hits->buffer()); + int32_t *lookup_indices_buf = reinterpret_cast<int32_t *>(_lookup_indices->buffer()); + + std::map<int32_t, size_t> key_map; + const size_t keys_num = _keys->info()->dimension(0); + for (size_t key_index = 0; key_index < keys_num; key_index++) + { + key_map[keys_buf[key_index]] = key_index; + } + + const size_t lookups_num = _lookups->info()->dimension(0); + for (size_t i = 0; i < lookups_num; ++i) + { + const auto lookup_value = lookups_buf[i]; + const auto it = key_map.find(lookup_value); + if (it != key_map.end()) + { +#if defined(DEBUG) + if (it->second >= lookups_num) + ARM_COMPUTE_ERROR("HashTable Lookup: index out of bounds."); +#endif // defined(DEBUG) + lookup_indices_buf[i] = static_cast<int32_t>(it->second); + hits_buf[i] = static_cast<uint8_t>(1); + } + else + { + lookup_indices_buf[i] = -1; + hits_buf[i] = static_cast<uint8_t>(0); + } + } + + const_cast<ICLTensor *>(_lookups)->unmap(queue); + const_cast<ICLTensor *>(_keys)->unmap(queue); + _hits->unmap(queue); + _lookup_indices->unmap(queue); + + Window win = window.collapse(ICLKernel::window(), 2, 4); + + Window win_lookup; + win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, win); + add_4D_tensor_argument(idx, _output, win); + add_1D_tensor_argument(idx, _lookup_indices.get(), win_lookup); + + enqueue(queue, *this, win); + } while (window.slide_window_slice_4D(win) && window.slide_window_slice_1D(win_lookup)); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp new file mode 100644 index 000000000..80d99dd3b --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLNegKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16, DataType::S32, + DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16, DataType::S32, + DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(input->info()->tensor_shape(), + output->info()->tensor_shape()); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + return Status{}; +} + +} // namespace + +CLNegKernel::CLNegKernel() : _input(nullptr), _output(nullptr) {} + +void CLNegKernel::configure(const ICLTensor *input, ICLTensor *output) +{ + + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info())); + + _input = input; + _output = output; + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Create kernel + std::set<std::string> build_opts; + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.emplace( + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("neg_tensor", build_opts)); + + // Configure window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, input->info()->valid_region()); + + ICLKernel::configure_internal(win); +} + +void CLNegKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice); + add_3D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice, lws_hint()); + } while (collapsed.slide_window_slice_3D(slice)); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp new file mode 100644 index 000000000..12bbe910f --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + NormalizationLayerInfo norm_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); + + // Checks performed when output is configured + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + } + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, + NormalizationLayerInfo norm_info) +{ + // Output tensor auto initialization if not yet initialized + auto_init_if_empty(*output, *input->clone()); + + const unsigned int norm_size = norm_info.norm_size(); + bool is_in_map = norm_info.is_in_map(); + + const unsigned int border_width = is_in_map ? std::min(norm_size / 2, 3U) : 0; + const BorderSize border_size = BorderSize(0, border_width); + + const unsigned int num_elems_processed_per_iteration = 4; + const unsigned int num_elems_read_per_iteration = + is_in_map ? (num_elems_processed_per_iteration + 2 * (norm_size / 2)) + : num_elems_processed_per_iteration; + + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); + + // We do not use a Rectangle window for IN_MAP_2D as we clamp the top and bottom accesses inside + // the kernel, avoiding padding + AccessWindowHorizontal input_access(input, -border_size.left, num_elems_read_per_iteration); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->valid_region()); + + Status err = (window_changed) + ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + return std::make_pair(err, win); +} +} // namespace + +CLNormalizationLayerExKernel::CLNormalizationLayerExKernel() + : _input(nullptr), _output(nullptr), _border_size(0), _is_in_map(false) +{ +} + +BorderSize CLNormalizationLayerExKernel::border_size() const { return _border_size; } + +void CLNormalizationLayerExKernel::configure(const ICLTensor *input, ICLTensor *output, + NormalizationLayerInfo norm_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + // Output tensor auto initialization if not yet initialized + auto_init_if_empty(*output->info(), *input->info()->clone()); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), norm_info)); + + _input = input; + _output = output; + + const unsigned int num_elems_processed_per_iteration = 4; + const bool is_in_map_2D = (norm_info.type() == NormType::IN_MAP_2D); + + // Set build options + CLBuildOptions build_opts; + build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.add_option( + ("-DCOEFF=" + float_to_string_with_full_precision(norm_info.scale_coeff()))); + build_opts.add_option(("-DBETA=" + float_to_string_with_full_precision(norm_info.beta()))); + build_opts.add_option(("-DKAPPA=" + float_to_string_with_full_precision(norm_info.kappa()))); + build_opts.add_option( + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + build_opts.add_option(("-DRADIUS=" + support::cpp11::to_string(norm_info.norm_size()))); + build_opts.add_option(("-DNUM_SLICES=" + support::cpp11::to_string(input->info()->dimension(2)))); + build_opts.add_option_if(is_in_map_2D, "-DIN_MAP_2D"); + + // Create kernel + std::string kernel_name = + _is_in_map ? "normalization_layer_in_map" : "normalization_layer_cross_map"; + _kernel = static_cast<cl::Kernel>( + CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options())); + + // Configure kernel window + auto win_config = validate_and_configure_window(input->info(), output->info(), norm_info); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); + + // Set config_id for enabling LWS tuning + _config_id = "normalization_layer_"; + _config_id += lower_string(string_from_data_type(input->info()->data_type())); + _config_id += "_"; + _config_id += support::cpp11::to_string( + static_cast<std::underlying_type<NormType>::type>(norm_info.type())); + _config_id += "_"; + _config_id += support::cpp11::to_string(norm_info.norm_size()); + _config_id += "_"; + _config_id += support::cpp11::to_string(input->info()->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(input->info()->dimension(1)); +} + +Status CLNormalizationLayerExKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + NormalizationLayerInfo norm_info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, norm_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), output->clone().get(), norm_info).first); + + return Status{}; +} + +void CLNormalizationLayerExKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const int collapsed_dimension = _is_in_map ? 
Window::DimZ : 4; + Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), collapsed_dimension); + Window slice = window_collapsed.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice); + add_3D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } while (window_collapsed.slide_window_slice_3D(slice)); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp new file mode 100644 index 000000000..241f8ae4d --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLPReLUKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +Status validate_info(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output) +{ + const TensorShape &out_shape = + TensorShape::broadcast_shape(input->tensor_shape(), alpha->tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, + DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(alpha, 1, DataType::F16, DataType::F32, + DataType::QASYMM8); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, + "Inputs are not broadcast compatible"); + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), + "Wrong shape for output"); + } + return Status{}; +} +} // namespace + +CLPReLUKernel::CLPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {} + +void CLPReLUKernel::configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, alpha); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), alpha->info(), output->info())); + + _input = input; + _alpha = alpha; + _output = output; + + // Create kernel + std::string kernel_name = "prelu"; + std::set<std::string> build_opts; + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.emplace( + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + + if (is_data_type_quantized_asymmetric(input->info()->data_type())) + { + build_opts.emplace("-DOFF_IN1=" + + support::cpp11::to_string(input->info()->quantization_info().offset)); + 
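// The remaining quantization parameters (alpha and output offsets and scales) are passed as build options in the same way, for the _qasymm8 kernel variant selected below. +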
build_opts.emplace("-DOFF_IN2=" + + support::cpp11::to_string(alpha->info()->quantization_info().offset)); + build_opts.emplace("-DOFF_OUT=" + + support::cpp11::to_string(output->info()->quantization_info().offset)); + build_opts.emplace("-DSCALE_IN1=" + + support::cpp11::to_string(input->info()->quantization_info().scale)); + build_opts.emplace("-DSCALE_IN2=" + + support::cpp11::to_string(alpha->info()->quantization_info().scale)); + build_opts.emplace("-DSCALE_OUT=" + + support::cpp11::to_string(output->info()->quantization_info().scale)); + kernel_name += "_qasymm8"; + } + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + + const std::pair<TensorShape, ValidRegion> broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info()); + + const TensorShape &out_shape = broadcast_pair.first; + const ValidRegion &valid_region = broadcast_pair.second; + + // Auto initialize output if not initialized + { + set_shape_if_empty(*output->info(), out_shape); + + if (input->info()->data_type() == DataType::F16 && alpha->info()->data_type() == DataType::F16) + { + set_format_if_unknown(*output->info(), Format::F16); + } + else if (input->info()->data_type() == DataType::F32 || + alpha->info()->data_type() == DataType::F32) + { + set_format_if_unknown(*output->info(), Format::F32); + } + } + + Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); + Window win_input1 = win.broadcast_if_dimension_le_one(*input->info()); + Window win_input2 = win.broadcast_if_dimension_le_one(*alpha->info()); + + AccessWindowHorizontal input1_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input2_access(alpha->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win_input1, input1_access) || + update_window_and_padding(win_input2, input2_access) || + update_window_and_padding(win, output_access); + + output_access.set_valid_region(win, valid_region); + + ICLKernel::configure_internal(win); +} + +void CLPReLUKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const TensorShape &in_shape1 = _input->info()->tensor_shape(); + const TensorShape &in_shape2 = _alpha->info()->tensor_shape(); + const TensorShape &out_shape = _output->info()->tensor_shape(); + + bool can_collapse = true; + if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) + { + can_collapse = + (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); + for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) + { + can_collapse = (in_shape1[d] == in_shape2[d]); + } + } + + bool has_collapsed = false; + Window collapsed = + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) + : window; + + const TensorShape &in_shape1_collapsed = + has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; + const TensorShape &in_shape2_collapsed = + has_collapsed ? 
in_shape2.collapsed_from(Window::DimZ) : in_shape2; + + Window slice = collapsed.first_slice_window_3D(); + Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); + Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice_input1); + add_3D_tensor_argument(idx, _alpha, slice_input2); + add_3D_tensor_argument(idx, _output, slice); + + enqueue(queue, *this, slice); + + collapsed.slide_window_slice_3D(slice_input1); + collapsed.slide_window_slice_3D(slice_input2); + } while (collapsed.slide_window_slice_3D(slice)); +} + +BorderSize CLPReLUKernel::border_size() const +{ + const unsigned int replicateSize = + _output->info()->dimension(0) - + std::min(_input->info()->dimension(0), _alpha->info()->dimension(0)); + const unsigned int border = + std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); + return BorderSize(0, border, 0, 0); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp new file mode 100644 index 000000000..99b54c822 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#include "arm_compute/core/CL/kernels/CLPadLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input_info, const ITensorInfo *output_info,
+                          const ITensorInfo *pad_size_info)
+{
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_info, 1, DataType::U8, DataType::QASYMM8,
+                                                DataType::S16, DataType::S32, DataType::F16,
+                                                DataType::F32);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_info, 1, DataType::U8, DataType::QASYMM8,
+                                                DataType::S16, DataType::S32, DataType::F16,
+                                                DataType::F32);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(pad_size_info, 1, DataType::S32);
+
+  // Report an error unless the input rank is between 1 and 4 (inclusive)
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_info->num_dimensions() == 0 ||
+                                      input_info->num_dimensions() > 4,
+                                  "Pad kernel supports up to 4-D input tensor");
+
+  // Padding does not change the rank, so input and output must have the same number of dimensions
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      input_info->num_dimensions() != output_info->num_dimensions(),
+      "output tensor should have same number of dimensions as input tensor");
+
+  if (input_info->data_type() == DataType::QASYMM8)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_info->quantization_info() !=
+                                        output_info->quantization_info(),
+                                    "The input and output quantization info are different!");
+  }
+
+  return Status{};
+}
+
+} // namespace
+
+CLPadLayerKernel::CLPadLayerKernel() : _input(nullptr), _output(nullptr), _pad_size(nullptr) {}
+
+void CLPadLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *pad_size)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, pad_size);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pad_size->info()));
+
+  _input = input;
+  _output = output;
+  _pad_size = pad_size;
+
+  // Set kernel build options
+  std::set<std::string> build_opts;
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+  build_opts.emplace("-DIB=" + support::cpp11::to_string(input->info()->dimension(3)));
+  build_opts.emplace("-DIW=" + support::cpp11::to_string(input->info()->dimension(0)));
+  build_opts.emplace("-DIH=" + support::cpp11::to_string(input->info()->dimension(1)));
+  build_opts.emplace("-DID=" + support::cpp11::to_string(input->info()->dimension(2)));
+  if (input->info()->data_type() == DataType::QASYMM8)
+  {
+    build_opts.emplace("-DZERO_VALUE=" +
+                       support::cpp11::to_string(input->info()->quantization_info().offset));
+  }
+  else
+  {
+    build_opts.emplace("-DZERO_VALUE=" + support::cpp11::to_string(0));
+  }
+
+  // Create kernel
+  _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("pad", build_opts));
+
+  // Configure kernel window
+  Window win = calculate_max_window(*output->info(), Steps());
+
+  Coordinates coord;
+  coord.set_num_dimensions(output->info()->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+  ICLKernel::configure_internal(win);
+}
+
+void CLPadLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+  _pad_size->map(queue);
+
+  // Only the padding values for batch (up), top, left and front are required; which rows of
+  // pad_size they come from depends on the rank of the tensor
+  int rank = _pad_size->info()->dimension(1);
+
+  auto pad_batch_up =
+      (rank == 4) ? 
*reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, 0})) : 0; + auto pad_height_top = + (rank >= 2) + ? *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, (rank == 2) ? 0 : 1})) + : 0; + auto pad_width_left = (rank >= 1) + ? *reinterpret_cast<const int32_t *>( + _pad_size->ptr_to_element({0, (rank == 4) ? 2 : rank - 1})) + : 0; + auto pad_depth_front = + (rank >= 3) + ? *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, (rank == 3) ? 0 : 3})) + : 0; + + _pad_size->unmap(queue); + + // Pad_values which needs to be passed + const cl_int4 paddingValues = { + {static_cast<cl_int>(pad_width_left), static_cast<cl_int>(pad_height_top), + static_cast<cl_int>(pad_depth_front), static_cast<cl_int>(pad_batch_up)}}; + + Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + // Setup output slice + Window slice_in(slice_out); + slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_in.set(3, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + _kernel.setArg<cl_int4>(idx++, paddingValues); + enqueue(queue, *this, slice_out); + } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in)); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp new file mode 100644 index 000000000..aa094761c --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLPermuteExKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +using namespace arm_compute; + +namespace +{ +TensorShape get_output_shape(const ITensorInfo *input, const PermutationVector &perm) +{ + TensorShape output_shape = input->tensor_shape(); + permute(output_shape, perm); + return output_shape; +} + +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const PermutationVector &perm) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + + const TensorShape output_shape = + misc::shape_calculator::compute_permutation_output_shape(*input, perm); + + // Validate configured output + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + } + return Status{}; +} +} // namespace + +CLPermuteExKernel::CLPermuteExKernel() : _input(nullptr), _output(nullptr), _perm() {} + +void CLPermuteExKernel::configure(const ICLTensor *input, ICLTensor *output, + const PermutationVector &perm) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), perm)); + + _input = input; + _output = output; + _perm = perm; + + const TensorShape output_shape = get_output_shape(input->info(), perm); + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); + + // Create kernel + std::set<std::string> build_opts; + + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2))); + + // New positions of batch(D), height(H), width(w) and channel(C) based on permutation vector + build_opts.emplace("-DP1=" + support::cpp11::to_string(perm[0])); + build_opts.emplace("-DP2=" + support::cpp11::to_string(perm[1])); + build_opts.emplace("-DP3=" + support::cpp11::to_string(perm[2])); + build_opts.emplace("-DP4=" + support::cpp11::to_string(perm[3])); + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("permute_generic", build_opts)); + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps()); + + // The CLPermute doesn't need padding so update_window_and_padding() can be skipped + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + + ICLKernel::configure_internal(win); +} + +Status CLPermuteExKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + const PermutationVector &perm) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, perm)); + + return Status{}; +} + +void CLPermuteExKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + + Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + // Setup 
output slice + Window slice_out(slice_in); + slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_out.set(3, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_in); + } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp index a3e0163de..b985aa737 100644 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp @@ -17,20 +17,8 @@ #include "arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -#include <cmath> -#include <cstdlib> -#include <set> -#include <string> using namespace arm_compute; @@ -45,12 +33,10 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, ARM_COMPUTE_UNUSED(overflow_policy); ARM_COMPUTE_UNUSED(rounding_policy); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, - DataType::QS16, DataType::S16, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, - DataType::QS16, DataType::S16, DataType::F16, - DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative."); const TensorShape &out_shape = @@ -58,21 +44,11 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2); - - if (is_data_type_fixed_point(input1->data_type())) - { - // All data types must be all QS8 or all QS16 - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale != 1, - "Unsupported scaling factor for QS8/QS16. 
Scale must be 1."); - } // Validate in case of configured output if (output->total_size() > 0) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, - DataType::QS16, DataType::S16, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MSG( output->data_type() == DataType::U8 && @@ -81,11 +57,6 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, ARM_COMPUTE_RETURN_ERROR_ON_MSG( detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output"); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, output); - if (is_data_type_fixed_point(input1->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); - } } return Status{}; @@ -191,14 +162,6 @@ void CLPixelWiseDivisionKernel::configure(const ICLTensor *input1, const ICLTens { compute_type = "int"; } - else if (input1->info()->data_type() == DataType::QS8) - { - compute_type = "qs8"; - } - else if (input1->info()->data_type() == DataType::QS16) - { - compute_type = "qs16"; - } else { compute_type = "ushort"; @@ -218,11 +181,6 @@ void CLPixelWiseDivisionKernel::configure(const ICLTensor *input1, const ICLTens : "-DSATURATE"); build_opts.emplace((rounding_policy == RoundingPolicy::TO_ZERO) ? "-DROUND=_rtz" : "-DROUND=_rte"); - if (is_data_type_fixed_point(input1->info()->data_type())) - { - build_opts.emplace("-DFIXED_POINT_POSITION=" + - support::cpp11::to_string(input1->info()->fixed_point_position())); - } build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type())); build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type())); build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); @@ -245,7 +203,7 @@ void CLPixelWiseDivisionKernel::configure(const ICLTensor *input1, const ICLTens _kernel.setArg(idx++, scale); } - ICLKernel::configure(win_config.second); + ICLKernel::configure_internal(win_config.second); } Status CLPixelWiseDivisionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp deleted file mode 100644 index 168b246bf..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2016-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "arm_compute/core/CL/kernels/CLReduceMaxKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -#include <cmath> -#include <cstdlib> -#include <set> -#include <string> - -using namespace arm_compute; - -namespace -{ -constexpr unsigned int num_elems_processed_per_iteration = 16; - -Status validate_arguments(const ITensorInfo *input, int32_t axis, const ITensorInfo *output) -{ - // We can handle for simple case only - // Input rank: 2 - // Output rank: 1 - // Axis: one axis value, restrict to 1 - - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis != 1, "Axis only allowed 1"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0, - "Inputs are not broadcast compatible"); - - // Validate in case of configured output - if (output->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != input->data_type(), - "Output same type allowed for input and output"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().num_dimensions() != 1, - "Only support for output dimension 1"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->tensor_shape().num_dimensions() != 2, - "Only support for input dimension 2"); - } - - return Status{}; -} - -} // namespace - -CLReduceMaxKernel::CLReduceMaxKernel() : _input(nullptr), _output(nullptr), _axis(0) {} - -void CLReduceMaxKernel::configure(const ICLTensor *input, int32_t axis, ICLTensor *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, output->info())); - - _input = input; - _output = output; - _axis = axis; - - // Configure kernel window - int cols = _input->info()->tensor_shape()[0]; - int rows = _input->info()->tensor_shape()[1]; - Window win; - win.set(0, Window::Dimension(0, cols, 1)); - win.set(1, Window::Dimension(0, rows, 1)); - - // Construct kernel name - std::string kernel_name = "reduce_max"; - - // Set kernel build options - std::set<std::string> build_opts; - build_opts.emplace("-DWIDTH=" + support::cpp11::to_string(cols)); - - // Create kernel - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); - - ICLKernel::configure(win); -} - -Status CLReduceMaxKernel::validate(const ITensorInfo *input, int32_t axis, - const ITensorInfo *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, output)); - - return Status{}; -} - -void CLReduceMaxKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window window_input = window; - Window slice_input = window_input.first_slice_window_1D(); - - do - { - Window slice_output = slice_input.shift_dimensions(1); - unsigned int idx = 0; - add_1D_tensor_argument(idx, _input, slice_input); - add_1D_tensor_argument(idx, _output, slice_output); - enqueue(queue, *this, slice_input); - - } while 
(window_input.slide_window_slice_1D(slice_input)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp new file mode 100644 index 000000000..f581780e1 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; +namespace +{ +// NOTE This is necessary because it is not guaranteed that the axis positions of input and output +// are the same. +const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis) +{ + TensorShape out_shape{input_shape}; + + out_shape.set(axis, 1); + + return out_shape; +} +} // namespace + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis, + ReduceOperation op) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + } + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, + DataType::F32, DataType::S32); + if (op == ReduceOperation::MEAN || op == ReduceOperation::SUM) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8, + "Not support QASYMM8, yet"); + } + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0, + "Inputs are not broadcast compatible"); + + const auto num_dimensions = input->tensor_shape().num_dimensions(); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + axis >= 0 && axis < num_dimensions, + "axis must be greater than or equal to 0 and less than (input's rank)."); + + const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(), + "output shape's size does not match axis"); + + return Status{}; +} +} // namespace + +CLReduceOperationKernel::CLReduceOperationKernel() : _input(nullptr), _output(nullptr), _axis() {} + +void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *output, + const uint32_t axis, ReduceOperation op) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op)); + + _input = input; + _output = output; + _axis = axis; + + std::unique_ptr<ITensorInfo> output_info = output->info()->clone(); + output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis)); + + // Construct kernel name + std::string kernel_name; + 
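// op_code selects the reduction inside the shared OpenCL kernels: reduce_min_max implements MAX (1) and MIN (2), reduce_sum_mean implements SUM (3) and MEAN (4).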
int op_code = 0; + if (op == ReduceOperation::MAX) + { + kernel_name = "reduce_min_max"; + op_code = 1; + } + else if (op == ReduceOperation::MIN) + { + kernel_name = "reduce_min_max"; + op_code = 2; + } + else if (op == ReduceOperation::SUM) + { + kernel_name = "reduce_sum_mean"; + op_code = 3; + } + else if (op == ReduceOperation::MEAN) + { + kernel_name = "reduce_sum_mean"; + op_code = 4; + } + else + throw std::runtime_error("Operation not supported, yet"); + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(output_info->data_type())); + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2))); + build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code)); + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + + // Configure kernel window + Window win = calculate_max_window(*output_info, Steps()); + + Coordinates coord; + coord.set_num_dimensions(output_info->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape())); + + ICLKernel::configure_internal(win); +} + +Status CLReduceOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + const uint32_t axis, ReduceOperation op) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op)); + + return Status{}; +} + +void CLReduceOperationKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const TensorShape &shape_in = _input->info()->tensor_shape(); + + unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters + + _kernel.setArg<cl_int>(idx++, _axis); + _kernel.setArg<cl_int>(idx++, shape_in[_axis]); + + // Support dimensions up to 4 + Window slice_out = window.collapse(ICLKernel::window(), 2, 4); + + // Setup input slice + Window slice_in(slice_out); + slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_in.set(3, Window::Dimension(0, 0, 0)); + + // Copy output's shape in order to use for recovering at end of this method + // TODO Remove changing and recovering output's shape if it is guaranteed that the axis positions + // of input and output are the same + const TensorShape shape_out = _output->info()->tensor_shape(); + _output->info()->set_tensor_shape(inferOutputShape(shape_in, _axis)); + + idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_out); + + // Recover output's shape of output tensor + _output->info()->set_tensor_shape(shape_out); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReductionMeanKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReductionMeanKernel.cpp deleted file mode 100644 index 84a77122d..000000000 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLReductionMeanKernel.cpp +++ /dev/null @@ -1,198 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/core/CL/kernels/CLReductionMeanKernel.h" - -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/FixedPoint.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -#include "support/ToolchainSupport.h" - -using namespace arm_compute; - -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - std::vector<uint32_t> axis) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis.size() >= TensorShape::num_max_dimensions, - "Reduction axis greater than max number of dimensions"); - - std::vector<uint32_t>::const_iterator it; - bool axis_w = false; - bool axis_h = false; - for (it = axis.begin(); it != axis.end(); ++it) - { - if ((*it) == 0) - { - axis_w = true; - } - else if ((*it) == 1) - { - axis_h = true; - } - else - { - ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported axis!"); - } - } - // TODO Other axises (currently, only axises for both width and height are supported.) - if (!axis_w || !axis_h) - { - ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported axis!"); - } - - if (output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != DataLayout::NCHW); - } - - return Status{}; -} - -std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, - std::vector<uint32_t> axis) -{ - // Output tensor auto initialization if not yet initialized - TensorShape output_shape{input->tensor_shape()}; - output_shape.set(0, 1); - output_shape.set(1, 1); - auto_init_if_empty(*output, output_shape, output->num_channels(), input->data_type(), - input->fixed_point_position()); - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration_x = 8; // step - const unsigned int num_elems_processed_per_iteration_y = input->dimension(1); - - Window win = calculate_max_window( - *input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, - num_elems_processed_per_iteration_y); - AccessWindowHorizontal output_access(output, 0, 1); - bool window_changed = update_window_and_padding(win, input_access, output_access); - output_access.set_valid_region(win, output->valid_region()); - - Status err = (window_changed) - ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; - - return std::make_tuple(err, win); -} -} // namespace - -CLReductionMeanKernel::CLReductionMeanKernel() - : _input(nullptr), _output(nullptr), _reduction_axis(), _border_size() -{ -} - -BorderSize CLReductionMeanKernel::border_size() const { return _border_size; } - -void CLReductionMeanKernel::configure(const ICLTensor *input, ICLTensor *output, - std::vector<uint32_t> axis) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis)); - - _input = input; - _output = output; - _reduction_axis = axis; - - constexpr unsigned int num_elems_processed_per_iteration_x = 8; // step - - // Set border size - _border_size = BorderSize( - ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration_x) - - input->info()->dimension(0)); - - // Set build options - std::set<std::string> build_opts; - build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); - // build_opts.emplace(("-DVEC_SIZE=" + - // support::cpp11::to_string(num_elems_processed_per_iteration))); - if (is_data_type_fixed_point(input->info()->data_type())) - { - build_opts.emplace("-DFIXED_POINT_POSITION=" + - support::cpp11::to_string(input->info()->fixed_point_position())); - } - - // Create kernel - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("reduction_mean", build_opts)); - - // Configure kernel window - auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis); - - ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); - - ICLKernel::configure(std::get<1>(win_config)); -} - -Status CLReductionMeanKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - std::vector<uint32_t> axis) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis)); - ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>( - validate_and_configure_window(input->clone().get(), output->clone().get(), axis))); - - return Status{}; -} - -void CLReductionMeanKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - // Set out window - Window out_window(window); - out_window.set(Window::DimX, Window::Dimension(0, 0, 0)); - - // Get first input and output slices - Window in_slice = window.first_slice_window_2D(); - Window out_slice = out_window.first_slice_window_2D(); - - // Set local sums buffer - // TODO work_group - unsigned int local_sum_size = _lws_hint[0] * _input->info()->element_size(); - - unsigned int idx = 2 * num_arguments_per_2D_tensor(); - _kernel.setArg(idx++, local_sum_size, nullptr); - _kernel.setArg<cl_int>(idx++, static_cast<cl_int>(_input->info()->dimension(1))); // height - _kernel.setArg<cl_int>(idx++, static_cast<cl_int>(_input->info()->dimension(0) * - _input->info()->dimension(1))); // divider - - do - { - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input, in_slice); - in_slice.set_dimension_step(Window::DimY, _input->info()->dimension(1)); - add_2D_tensor_argument(idx, _output, out_slice); - enqueue(queue, *this, in_slice); - } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice)); -} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp new file mode 100644 index 000000000..6b0697e89 
--- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_size, + const ITensorInfo *padding_size, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::F16, DataType::S32, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_size, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(padding_size, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::F16, DataType::S32, + DataType::F32); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != output->num_dimensions(), + "The number of dimensions of input should be equal to output"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() != output->data_layout(), + "The input and output layouts are different!"); + + // TODO Support other cases + if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NCHW) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != output->dimension(2), + "Input Depth should be equal to Output Depth"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 || + padding_size->dimension(1) != 2, + "Only 2-dimensional spatial block's size was wrong"); + } + else if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NHWC) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(0) != output->dimension(0), + "Input Depth should be equal to Output Depth"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 || + padding_size->dimension(1) != 2, + "Only 2-dimensional spatial block's size was wrong"); + } + else + { + ARM_COMPUTE_RETURN_ERROR_MSG("CLSpaceToBatchNDKernel supports only 4-dimensional input"); + } + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() < 2 && input->num_dimensions() > 4, + "CLSpaceToBatchNDKernel supports dimensions up to 4"); + + if (input->data_type() == DataType::QASYMM8) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->quantization_info() != output->quantization_info(), + "The input and output quantization info are different!"); + } + + return Status{}; +} + +} // namespace + +CLSpaceToBatchNDKernel::CLSpaceToBatchNDKernel() : _input(nullptr), _output(nullptr) {} + +void CLSpaceToBatchNDKernel::configure(const ICLTensor *input, const ICLTensor *block_size, + const ICLTensor 
*padding_size, ICLTensor *output) +{ + + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), block_size->info(), padding_size->info(), output->info())); + + _input = input; + _block_size = block_size; + _padding_size = padding_size; + _output = output; + + // Set kernel build options + // TODO Support other cases + std::string kernel_name = "space_to_batch_4d"; + std::set<std::string> build_opts; + Window win; + + if (input->info()->data_layout() == DataLayout::NCHW) + { + kernel_name += "_nchw"; + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); + build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(1))); + build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(0))); + + win = calculate_max_window(*output->info(), Steps()); + + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + } + else if (input->info()->data_layout() == DataLayout::NHWC) + { + kernel_name += "_nhwc"; + build_opts.emplace("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); + build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(2))); + build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(1))); + build_opts.emplace("-DVEC_SIZE=" + + support::cpp11::to_string(num_elems_processed_per_iteration)); + + win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, input_access, output_access); + input_access.set_valid_region(win, output->info()->valid_region()); + + if (window_changed) + { + ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!"); + } + } + else + { + ARM_COMPUTE_ERROR("Unsupported layout"); + } + + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(3))); + if (input->info()->data_type() == DataType::QASYMM8) + { + build_opts.emplace("-DZERO_VALUE=" + + support::cpp11::to_string(input->info()->quantization_info().offset)); + } + else + { + build_opts.emplace("-DZERO_VALUE=" + support::cpp11::to_string(0)); + } + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + + // Configure kernel window + ICLKernel::configure_internal(win); +} + +void CLSpaceToBatchNDKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + +#if defined(DEBUG) + const_cast<ICLTensor *>(_block_size)->map(queue); + const_cast<ICLTensor *>(_padding_size)->map(queue); + + const size_t num_dimensions = _input->info()->num_dimensions(); + const size_t num_spacial_dimensions = _block_size->info()->dimension(0); + int32_t batch_size = _input->info()->dimension(num_dimensions - 1); + for (size_t i = 0; i < num_spacial_dimensions; ++i) + { + const int32_t block_size = *reinterpret_cast<int32_t *>(_block_size->ptr_to_element({i})); + const int32_t padding_size_pre = + 
*reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({0, i})); + const int32_t padding_size_post = + *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({1, i})); + + ARM_COMPUTE_ERROR_ON_MSG(block_size < 1, "Block size should be greater than or equal to 1"); + ARM_COMPUTE_ERROR_ON_MSG(padding_size_pre < 0 && padding_size_post < 0, + "Padding size should be greater than or equal to 0"); + + if (num_dimensions == 4 && _input->info()->data_layout() == DataLayout::NCHW) + { + ARM_COMPUTE_ERROR_ON_MSG( + _output->info()->dimension(i) != + (_input->info()->dimension(i) + padding_size_pre + padding_size_post) / block_size, + "Dimension value of spatial block does not match output's dimension value"); + } + else + { + ARM_COMPUTE_ERROR_ON_MSG( + _output->info()->dimension(num_dimensions - num_spacial_dimensions - 1 + i) != + (_input->info()->dimension(num_dimensions - num_spacial_dimensions - 1 + i) + + padding_size_pre + padding_size_post) / + block_size, + "Dimension value of spatial block does not match output's dimension value"); + } + + batch_size *= block_size; + } + ARM_COMPUTE_ERROR_ON_MSG( + _output->info()->dimension(num_dimensions - 1) != batch_size, + "Output batch size should be equal to input batch size * (multiplication of all block size)"); + + const_cast<ICLTensor *>(_block_size)->unmap(queue); + const_cast<ICLTensor *>(_padding_size)->unmap(queue); +#endif // defined(DEBUG) + + Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + // Setup output slice + Window slice_in(slice_out); + slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_in.set(3, Window::Dimension(0, 0, 0)); + + // Set block size window + Window win_block = calculate_max_window(*_block_size->info(), Steps()); + + // Set padding size window + Window win_padding = calculate_max_window(*_padding_size->info(), Steps()); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + add_1D_tensor_argument(idx, _block_size, win_block); + add_2D_tensor_argument(idx, _padding_size, win_padding); + enqueue(queue, *this, slice_out); + } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in)); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp new file mode 100644 index 000000000..5d6329edc --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          const int32_t block_size)
+{
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+                                                DataType::S16, DataType::S32, DataType::F16,
+                                                DataType::F32);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+                                                DataType::S16, DataType::S32, DataType::F16,
+                                                DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1,
+                                  "Block size should be greater than or equal to 1.");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(3) != output->dimension(3),
+                                  "Input batch should be equal to Output batch");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      input->dimension(2) * block_size * block_size != output->dimension(2),
+      "Output depth should be equal to (input depth * block size * block size)");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->dimension(0) % block_size) ||
+                                      (input->dimension(1) % block_size),
+                                  "Input height and width should be divisible by block size");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != (input->dimension(0) / block_size)) ||
+                                      (output->dimension(1) != (input->dimension(1) / block_size)),
+                                  "Output height and width should be equal to "
+                                  "input_height/blocksize and input_width/blocksize respectively");
+
+  return Status{};
+}
+
+} // namespace
+
+CLSpaceToDepthKernel::CLSpaceToDepthKernel() : _input(nullptr), _output(nullptr) {}
+
+void CLSpaceToDepthKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                     const int32_t block_size)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size));
+
+  _input = input;
+  _output = output;
+
+  // Set kernel build options
+  std::set<std::string> build_opts;
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
+  build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
+
+  // Create kernel
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("space_to_depth", build_opts));
+
+  // Configure kernel window
+  Window win = calculate_max_window(*input->info(), Steps());
+
+  Coordinates coord;
+  coord.set_num_dimensions(output->info()->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+  ICLKernel::configure_internal(win);
+}
+
+void CLSpaceToDepthKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+  Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  // Setup output slice
+  Window slice_out(slice_in);
+  slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+  slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+  slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+  slice_out.set(3, Window::Dimension(0, 0, 0));
+
+  do
+  {
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, slice_in);
+    add_4D_tensor_argument(idx, _output, slice_out);
+    enqueue(queue, *this, slice_in);
+  } while 
(window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp new file mode 100644 index 000000000..260bc39f1 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLSquaredDifferenceKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) +{ + const TensorShape &out_shape = + TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::F16, DataType::F32); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, + "Inputs are not broadcast compatible"); + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), + "Wrong shape for output"); + } + return Status{}; +} +} // namespace + +CLSquaredDifferenceKernel::CLSquaredDifferenceKernel() + : _input1(nullptr), _input2(nullptr), _output(nullptr) +{ +} + +void CLSquaredDifferenceKernel::configure(const ICLTensor *input1, const ICLTensor *input2, + ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); + ARM_COMPUTE_ERROR_THROW_ON(validate(input1->info(), input2->info(), output->info())); + + _input1 = input1; + _input2 = input2; + _output = output; + + // Create kernel + std::set<std::string> build_opts; + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type()))); + build_opts.emplace( + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("squared_difference", build_opts)); + + const std::pair<TensorShape, ValidRegion> broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info()); + + const TensorShape &out_shape = broadcast_pair.first; + const ValidRegion &valid_region = broadcast_pair.second; + + // Auto initialize output if not initialized + { + set_shape_if_empty(*output->info(), out_shape); + + if 
(input1->info()->data_type() == DataType::F16 && + input2->info()->data_type() == DataType::F16) + { + set_format_if_unknown(*output->info(), Format::F16); + } + else if (input1->info()->data_type() == DataType::F32 || + input2->info()->data_type() == DataType::F32) + { + set_format_if_unknown(*output->info(), Format::F32); + } + } + + Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); + Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info()); + Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info()); + + AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win_input1, input1_access) || + update_window_and_padding(win_input2, input2_access) || + update_window_and_padding(win, output_access); + + output_access.set_valid_region(win, valid_region); + + ICLKernel::configure_internal(win); +} + +void CLSquaredDifferenceKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const TensorShape &in_shape1 = _input1->info()->tensor_shape(); + const TensorShape &in_shape2 = _input2->info()->tensor_shape(); + const TensorShape &out_shape = _output->info()->tensor_shape(); + + bool can_collapse = true; + if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) + { + can_collapse = + (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); + for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) + { + can_collapse = (in_shape1[d] == in_shape2[d]); + } + } + + bool has_collapsed = false; + Window collapsed = + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) + : window; + + const TensorShape &in_shape1_collapsed = + has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; + const TensorShape &in_shape2_collapsed = + has_collapsed ? 
in_shape2.collapsed_from(Window::DimZ) : in_shape2; + + Window slice = collapsed.first_slice_window_3D(); + Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); + Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input1, slice_input1); + add_3D_tensor_argument(idx, _input2, slice_input2); + add_3D_tensor_argument(idx, _output, slice); + + enqueue(queue, *this, slice); + + collapsed.slide_window_slice_3D(slice_input1); + collapsed.slide_window_slice_3D(slice_input2); + } while (collapsed.slide_window_slice_3D(slice)); +} + +BorderSize CLSquaredDifferenceKernel::border_size() const +{ + const unsigned int replicateSize = + _output->info()->dimension(0) - + std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); + const unsigned int border = + std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); + return BorderSize(0, border, 0, 0); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp index 80ffd423a..48146a43a 100644 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceKernel.cpp +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp @@ -14,43 +14,30 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h" +#include "arm_compute/core/CL/kernels/CLStridedSliceExKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include <string> - -using namespace std; using namespace arm_compute; -static const int32_t maxDim = 4; - -CLStridedSliceKernel::CLStridedSliceKernel() +CLStridedSliceExKernel::CLStridedSliceExKernel() : _input(nullptr), _output(nullptr), _beginData(nullptr), _endData(nullptr), _stridesData(nullptr), _beginMask(0), _endMask(0), _shrinkAxisMask(0) { } -Status CLStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *begin, const ITensorInfo *end, - const ITensorInfo *strides, int32_t beginMask, - int32_t endMask, int32_t shrinkAxisMask) +Status CLStridedSliceExKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *begin, const ITensorInfo *end, + const ITensorInfo *strides, int32_t beginMask, + int32_t endMask, int32_t shrinkAxisMask) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, begin, end, strides); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8, DataType::U16, - DataType::S16, DataType::QS16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(begin, 1, DataType::S32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(end, 1, DataType::S32); 
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(strides, 1, DataType::S32); @@ -153,15 +140,6 @@ inline int32_t StopForAxis(int32_t endMask, int32_t end, int32_t stride, return stop; } -inline int32_t offset4D(const TensorShape &shape, int32_t b, int32_t d, int32_t h, int32_t w) -{ - int32_t offset = b * shape[2] * shape[1] * shape[0]; - offset += d * shape[1] * shape[0]; - offset += h * shape[0]; - offset += w; - return offset; -} - inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride) { int32_t ret = 0; @@ -177,10 +155,10 @@ inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride) return ret; } -void CLStridedSliceKernel::configure(const ICLTensor *input, ICLTensor *output, - ICLTensor *beginData, ICLTensor *endData, - ICLTensor *stridesData, int32_t beginMask, int32_t endMask, - int32_t shrinkAxisMask) +void CLStridedSliceExKernel::configure(const ICLTensor *input, ICLTensor *output, + ICLTensor *beginData, ICLTensor *endData, + ICLTensor *stridesData, int32_t beginMask, int32_t endMask, + int32_t shrinkAxisMask) { ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), beginData->info(), endData->info(), stridesData->info(), beginMask, endMask, @@ -195,48 +173,31 @@ void CLStridedSliceKernel::configure(const ICLTensor *input, ICLTensor *output, _endMask = endMask; _shrinkAxisMask = shrinkAxisMask; - constexpr unsigned int num_elems_processed_per_iteration = 1; - // Set kernel build options std::set<std::string> build_opts; build_opts.emplace("-DELEMENT_DATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.emplace("-DELEMENT_SIZE=" + support::cpp11::to_string(input->info()->element_size())); + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); // Create kernel - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("strided_slice", build_opts)); + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("strided_slice_ex", build_opts)); - // Create output's window without padding - TensorShape collapsed = output->info()->tensor_shape(); - collapsed.collapse(4); - TensorInfo info = *output->info(); - info.set_tensor_shape(collapsed); - Window win = calculate_max_window(info, Steps(num_elems_processed_per_iteration)); - - ICLKernel::configure(win); + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps()); + ICLKernel::configure_internal(win); } -void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue) +void CLStridedSliceExKernel::run(const Window &window, cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - // Create input window - TensorShape collapsed = _input->info()->tensor_shape(); - collapsed.collapse(4); - TensorInfo info = *_input->info(); - info.set_tensor_shape(collapsed); - Window win_in = calculate_max_window(info, Steps(_input->info()->tensor_shape().total_size())); - _beginData->map(queue); _endData->map(queue); _stridesData->map(queue); - std::vector<int32_t> dimsIn; - std::vector<int32_t> dimsOut; std::vector<int32_t> starts; - std::vector<int32_t> stops; std::vector<int32_t> strides; for (uint32_t n = 0; n < _beginData->info()->tensor_shape().total_size(); ++n) @@ -246,22 +207,13 @@ void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue) StartForAxis(_beginMask, reinterpret_cast<int32_t *>(_beginData->buffer())[n], reinterpret_cast<int32_t 
*>(_stridesData->buffer())[n], shape, n)); - stops.emplace_back(StopForAxis(_endMask, reinterpret_cast<int32_t *>(_endData->buffer())[n], - reinterpret_cast<int32_t *>(_stridesData->buffer())[n], shape, - n)); - strides.emplace_back(reinterpret_cast<int32_t *>(_stridesData->buffer())[n]); - dimsIn.emplace_back(shape[n]); - dimsOut.emplace_back(getOutDim(starts[n], stops[n], strides[n])); } for (uint32_t n = _beginData->info()->tensor_shape().total_size(); n < 4; n++) { starts.emplace_back(0); - stops.emplace_back(1); strides.emplace_back(1); - dimsIn.emplace_back(1); - dimsOut.emplace_back(1); } // TODO: Apply shrinkAxisMask @@ -269,20 +221,7 @@ void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue) _stridesData->unmap(queue); _endData->unmap(queue); - // Set parameters - unsigned int idx = 2 * num_arguments_per_1D_tensor(); // Skip the input and output parameters - const cl_int4 dimsInArg = {{ - static_cast<cl_int>(dimsIn[0]), static_cast<cl_int>(dimsIn[1]), - static_cast<cl_int>(dimsIn[2]), static_cast<cl_int>(dimsIn[3]), - }}; - _kernel.setArg<cl_int4>(idx++, dimsInArg); - - const cl_int4 dimsOutArg = {{ - static_cast<cl_int>(dimsOut[0]), static_cast<cl_int>(dimsOut[1]), - static_cast<cl_int>(dimsOut[2]), static_cast<cl_int>(dimsOut[3]), - }}; - _kernel.setArg<cl_int4>(idx++, dimsOutArg); - + unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters const cl_int4 startsArg = {{ static_cast<cl_int>(starts[0]), static_cast<cl_int>(starts[1]), static_cast<cl_int>(starts[2]), static_cast<cl_int>(starts[3]), @@ -295,10 +234,20 @@ void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue) }}; _kernel.setArg<cl_int4>(idx++, stridesArg); - // TODO: Apply slicing output's window - idx = 0; - add_1D_tensor_argument(idx, _input, win_in); - add_1D_tensor_argument(idx, _output, window); + Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + // Setup output slice + Window slice_in(slice_out); + slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_in.set(3, Window::Dimension(0, 0, 0)); - enqueue(queue, *this, window); + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_out); + } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); } diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp index d95b485b7..073c2f7bb 100644 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp @@ -17,15 +17,8 @@ #include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -#include <climits> -#include <cassert> namespace arm_compute { @@ -59,7 +52,7 @@ void CLTopKV2Single::configure(ICLTensor *input, ICLTensor *topk_values, ICLTens // Configure kernel window Window win; win.set(0, Window::Dimension(0, 1, 1)); - ICLKernel::configure(win); + ICLKernel::configure_internal(win); } void 
CLTopKV2Single::run(const Window &window, cl::CommandQueue &queue) @@ -102,7 +95,7 @@ void CLTopKV2Init::configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffe // Configure kernel window Window win; win.set(0, Window::Dimension(0, n, 1)); - ICLKernel::configure(win); + ICLKernel::configure_internal(win); } void CLTopKV2Init::run(const Window &window, cl::CommandQueue &queue) @@ -147,7 +140,7 @@ void CLRadixSortHistogram::configure(cl::Buffer *hist_buf, int bits, int n) // Configure kernel window Window win; win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1)); - ICLKernel::configure(win); + ICLKernel::configure_internal(win); } void CLRadixSortHistogram::run(const Window &window, cl::CommandQueue &queue) @@ -192,7 +185,7 @@ void CLRadixSortScanHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_ // Configure kernel window Window win; win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1)); - ICLKernel::configure(win); + ICLKernel::configure_internal(win); } void CLRadixSortScanHistogram::run(const Window &window, cl::CommandQueue &queue) @@ -236,7 +229,7 @@ void CLRadixSortGlobalScanHistogram::configure(cl::Buffer *glob_sum_buf, cl::Buf // Configure kernel window Window win; win.set(0, Window::Dimension(0, _HISTOSPLIT / 2, 1)); - ICLKernel::configure(win); + ICLKernel::configure_internal(win); } void CLRadixSortGlobalScanHistogram::run(const Window &window, cl::CommandQueue &queue) @@ -275,7 +268,7 @@ void CLRadixSortPasteHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob // Configure kernel window Window win; win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1)); - ICLKernel::configure(win); + ICLKernel::configure_internal(win); } void CLRadixSortPasteHistogram::run(const Window &window, cl::CommandQueue &queue) @@ -322,7 +315,7 @@ void CLRadixSortReorder::configure(cl::Buffer *hist_buf, int bits, int n) // Configure kernel window Window win; win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1)); - ICLKernel::configure(win); + ICLKernel::configure_internal(win); } void CLRadixSortReorder::run(const Window &window, cl::CommandQueue &queue) @@ -365,7 +358,7 @@ void CLTopKV2FindFirstNegative::configure(cl::Buffer *first_negative_idx_buf, in // Configure kernel window Window win; win.set(0, Window::Dimension(0, n, 1)); - ICLKernel::configure(win); + ICLKernel::configure_internal(win); } void CLTopKV2FindFirstNegative::run(const Window &window, cl::CommandQueue &queue) @@ -404,7 +397,7 @@ void CLTopKV2ReorderNegatives::configure(cl::Buffer *first_negative_idx_buf, int // Configure kernel window Window win; win.set(0, Window::Dimension(0, n, 1)); - ICLKernel::configure(win); + ICLKernel::configure_internal(win); } void CLTopKV2ReorderNegatives::run(const Window &window, cl::CommandQueue &queue) @@ -449,7 +442,7 @@ void CLTopKV2Store::configure(ICLTensor *values, ICLTensor *indices, int k, int // Configure kernel window Window win; win.set(0, Window::Dimension(0, k, 1)); - ICLKernel::configure(win); + ICLKernel::configure_internal(win); } void CLTopKV2Store::setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf) diff --git a/libs/ARMComputeEx/src/core/NEON/kernels/NENormalizationLayerExKernel.cpp b/libs/ARMComputeEx/src/core/NEON/kernels/NENormalizationLayerExKernel.cpp new file mode 100644 index 000000000..3b5782c25 --- /dev/null +++ b/libs/ARMComputeEx/src/core/NEON/kernels/NENormalizationLayerExKernel.cpp @@ -0,0 +1,294 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/NEON/kernels/NENormalizationLayerExKernel.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/NEMath.h" + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *input_squared, + const ITensorInfo *output, const NormalizationLayerInfo &norm_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_squared, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_squared); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, input_squared); + + // Checks performed when output is configured + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + } + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, + ITensorInfo *input_squared, + ITensorInfo *output, + const NormalizationLayerInfo &norm_info) +{ + unsigned int num_elems_processed_per_iteration = 16 / input->element_size(); + const unsigned int num_elems_read_per_iteration = + num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2); + const unsigned int num_rows = + (norm_info.type() == NormType::IN_MAP_2D) ? norm_info.norm_size() : 1; + const unsigned int border_width = + (norm_info.is_cross_map()) ? 0 : std::min<unsigned int>(norm_info.norm_size() / 2, 3U); + BorderSize border_size = BorderSize(0, border_width); + bool window_changed = false; + + // Configure window + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); + + AccessWindowRectangle input_access(input, -border_size.left, 0, num_elems_read_per_iteration, + num_rows); + AccessWindowRectangle input_squared_access(input_squared, -border_size.left, 0, + num_elems_read_per_iteration, num_rows); + + if (output->total_size() != 0) + { + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + window_changed = + update_window_and_padding(win, input_access, input_squared_access, output_access); + output_access.set_valid_region(win, input->valid_region()); + } + else + { + window_changed = update_window_and_padding(win, input_access, input_squared_access); + } + + Status err = (window_changed) + ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + return std::make_pair(err, win); +} +} // namespace + +NENormalizationLayerExKernel::NENormalizationLayerExKernel() + : _func(nullptr), _input(nullptr), _input_squared(nullptr), _output(nullptr), + _norm_info(NormType::IN_MAP_1D), _border_size() +{ +} + +BorderSize NENormalizationLayerExKernel::border_size() const { return _border_size; } + +void NENormalizationLayerExKernel::configure(const ITensor *input, const ITensor *input_squared, + ITensor *output, NormalizationLayerInfo norm_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_squared, output); + // Output tensor auto initialization if not yet initialized + auto_init_if_empty(*output->info(), *input->info()); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), input_squared->info(), output->info(), norm_info)); + + const unsigned int border_width = + (norm_info.is_cross_map()) ? 0 : std::min<unsigned int>(norm_info.norm_size() / 2, 3U); + + _input = input; + _input_squared = input_squared; + _output = output; + _norm_info = norm_info; + _border_size = BorderSize(0, border_width); + + switch (_input->info()->data_type()) + { + case DataType::F32: + { + switch (norm_info.type()) + { + case NormType::IN_MAP_1D: + _func = &NENormalizationLayerExKernel::normalize_float<DataType::F32, 0, false>; + break; + case NormType::IN_MAP_2D: + // Normalize over X and Y + _func = &NENormalizationLayerExKernel::normalize_float<DataType::F32, 0, true>; + break; + case NormType::CROSS_MAP: + _func = &NENormalizationLayerExKernel::normalize_float<DataType::F32, 2, false>; + break; + default: + break; + } + break; + } + case DataType::F16: + { + switch (norm_info.type()) + { + case NormType::IN_MAP_1D: + _func = &NENormalizationLayerExKernel::normalize_float<DataType::F16, 0, false>; + break; + case NormType::IN_MAP_2D: + // Normalize over X and Y + _func = &NENormalizationLayerExKernel::normalize_float<DataType::F16, 0, true>; + break; + case NormType::CROSS_MAP: + _func = &NENormalizationLayerExKernel::normalize_float<DataType::F16, 2, false>; + break; + default: + break; + } + break; + } + default: + ARM_COMPUTE_ERROR("NOT SUPPORTED!"); + } + + // Configure kernel window + auto win_config = validate_and_configure_window(input->info(), input_squared->info(), + output->info(), norm_info); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + INEKernel::configure(win_config.second); +} + +template <DataType dt, unsigned int dim, bool do_2D_norm> +void NENormalizationLayerExKernel::normalize_float(const Window &window) +{ + Iterator input(_input, window); + Iterator input_squared(_input_squared, window); + Iterator output(_output, window); + + const int dim_y = 1; + const int radius = _norm_info.norm_size(); + const int total_size = _input->info()->dimension(dim) - 1; + const int input_squared_stride = _input_squared->info()->strides_in_bytes()[dim]; + // We account padding across X only and we iterate over rows + const int min_left = (dim == 2) ? 0 : -static_cast<int>(border_size().left); + const int max_right = (dim == 2) ? 
total_size : total_size + border_size().left; + const int min_top = 0; + const int max_bottom = _input->info()->dimension(dim_y) - 1; + + if (dt == DataType::F32) + { + const float32x4_t coeff_vec = vdupq_n_f32(_norm_info.scale_coeff()); + const float32x4_t beta_vec = vdupq_n_f32(_norm_info.beta()); + const float32x4_t kappa_vec = vdupq_n_f32(_norm_info.kappa()); + + execute_window_loop( + window, + [&](const Coordinates &id) { + // Get range to normalize + const int current_row = do_2D_norm ? id[dim_y] : 0; + const int current_slice = id[dim]; + const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0; + const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0; + const int first_slice = std::max(current_slice - radius, min_left); + const int last_slice = std::min(current_slice + radius, max_right); + + // Accumulate 2D In-Map values + float32x4_t accu = vdupq_n_f32(0.f); + for (int j = first_row; j <= last_row; j++) + { + // Compute row displacement + const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y]; + const uint8_t *const input_squared_ptr = + input_squared.ptr() + row - (current_slice * input_squared_stride); + for (int i = first_slice; i <= last_slice; ++i) + { + accu = vaddq_f32(accu, vld1q_f32(reinterpret_cast<const float *>( + input_squared_ptr + i * input_squared_stride))); + } + } + + // Normalize + const float32x4_t normalized = vpowq_f32(vmlaq_f32(kappa_vec, coeff_vec, accu), beta_vec); + const float32x4_t normalized_pixel = vmulq_f32( + vld1q_f32(reinterpret_cast<const float *>(input.ptr())), vinvq_f32(normalized)); + vst1q_f32(reinterpret_cast<float *>(output.ptr()), normalized_pixel); + }, + input, input_squared, output); + } +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + else if (dt == DataType::F16) + { + const float16x8_t coeff_vec = vdupq_n_f16(_norm_info.scale_coeff()); + const float16x8_t beta_vec_f16 = vdupq_n_f16(_norm_info.beta()); + const float16x8_t kappa_vec = vdupq_n_f16(_norm_info.kappa()); + + execute_window_loop( + window, + [&](const Coordinates &id) { + // Get range to normalize + const int current_row = do_2D_norm ? id[dim_y] : 0; + const int current_slice = id[dim]; + const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0; + const int last_row = do_2D_norm ? 
std::min(current_row + radius, max_bottom) : 0; + const int first_slice = std::max(current_slice - radius, min_left); + const int last_slice = std::min(current_slice + radius, max_right); + + // Accumulate 2D In-Map values + float16x8_t accu = vdupq_n_f16(0.f); + for (int j = first_row; j <= last_row; j++) + { + // Compute row displacement + const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y]; + const uint8_t *const input_squared_ptr = + input_squared.ptr() + row - (current_slice * input_squared_stride); + for (int i = first_slice; i <= last_slice; ++i) + { + accu = vaddq_f16(accu, vld1q_f16(reinterpret_cast<const float16_t *>( + input_squared_ptr + i * input_squared_stride))); + } + } + + const float16x8_t norm_f16 = + vpowq_f16(vaddq_f16(kappa_vec, vmulq_f16(coeff_vec, accu)), beta_vec_f16); + const float16x8_t normalized_pixel = vmulq_f16( + vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr())), vinvq_f16(norm_f16)); + vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), normalized_pixel); + }, + input, input_squared, output); + } +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + else + { + ARM_COMPUTE_ERROR("Not supported"); + } +} + +Status NENormalizationLayerExKernel::validate(const ITensorInfo *input, + const ITensorInfo *input_squared, + const ITensorInfo *output, + const NormalizationLayerInfo norm_info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, input_squared, output, norm_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), + input_squared->clone().get(), + output->clone().get(), norm_info) + .first); + + return Status{}; +} + +void NENormalizationLayerExKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + // Run function + (this->*_func)(window); +} diff --git a/libs/ARMComputeEx/src/core/UtilsEx.cpp b/libs/ARMComputeEx/src/core/UtilsEx.cpp new file mode 100644 index 000000000..b63093bbb --- /dev/null +++ b/libs/ARMComputeEx/src/core/UtilsEx.cpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/UtilsEx.h" + +#include <cstdint> +#include <fstream> +#include <map> +#include <string> + +using namespace arm_compute; + +const std::string & +arm_compute::string_from_activation_func_ex(ActivationLayerInfoEx::ActivationFunction act) +{ + static std::map<ActivationLayerInfoEx::ActivationFunction, const std::string> act_map = { + {ActivationLayerInfoEx::ActivationFunction::RSQRT, "RSQRT"}, + }; + + return act_map[act]; +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLActivationLayerEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLActivationLayerEx.cpp new file mode 100644 index 000000000..1e52fc429 --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLActivationLayerEx.cpp @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLActivationLayerEx.h" + +#include "arm_compute/core/CL/kernels/CLActivationLayerExKernel.h" + +using namespace arm_compute; + +void CLActivationLayerEx::configure(ICLTensor *input, ICLTensor *output, + ActivationLayerInfoEx act_info) +{ + auto k = arm_compute::support::cpp14::make_unique<CLActivationLayerExKernel>(); + k->configure(input, output, act_info); + _kernel = std::move(k); +} + +Status CLActivationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, + const ActivationLayerInfoEx &act_info) +{ + return CLActivationLayerExKernel::validate(input, output, act_info); +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLArgMinMax.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLArgMinMax.cpp new file mode 100644 index 000000000..dff743e89 --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLArgMinMax.cpp @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLArgMinMax.h" + +#include "arm_compute/core/CL/kernels/CLArgMinMaxKernel.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +namespace arm_compute +{ + +CLArgMinMax::CLArgMinMax() + : _input(nullptr), _output(nullptr), _argminmax_axis(), _interm_tensors(), _argminmax_kernels(), + _num_of_kernels() +{ +} + +void CLArgMinMax::configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis, + ArgOperation op) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), axis, op)); + _input = input; + _output = output; + _argminmax_axis = axis; + _arg_op = op; + // NOTE The argminmax_axis must have no duplication. + _num_of_kernels = axis.size(); + const size_t num_of_interm_tensors = _num_of_kernels - 1; + + _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors); + _argminmax_kernels = + arm_compute::support::cpp14::make_unique<CLArgMinMaxKernel[]>(_num_of_kernels); + + TensorShape shape{input->info()->tensor_shape()}; + for (size_t i = 0; i < num_of_interm_tensors; i++) + { + shape.set(_argminmax_axis[i], 1); + _interm_tensors[i].allocator()->init( + TensorInfo(shape, input->info()->num_channels(), input->info()->data_type())); + _interm_tensors[i].allocator()->allocate(); + } + + // Set a vector that is ordered ICLTensors sequentially. + std::vector<ICLTensor *> tensors; + tensors.emplace_back(input); + for (size_t i = 0; i < num_of_interm_tensors; i++) + { + tensors.emplace_back(_interm_tensors.get() + i); + } + tensors.emplace_back(output); + + // Apply ArgMinMax on all kernels + for (size_t i = 0; i < _num_of_kernels; i++) + { + _argminmax_kernels[i].configure(tensors[i], tensors[i + 1], _argminmax_axis[i], op); + } +} + +Status CLArgMinMax::validate(const ITensorInfo *input, const std::vector<uint32_t> &argminmax_axis, + const ITensorInfo *output, ArgOperation op) +{ + const size_t num_of_kernels = argminmax_axis.size(); + const size_t num_of_interm_tensors = num_of_kernels - 1; + + // Create temporary tensor infos + auto interm_tensors = + arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors); + + // Create intermediate tensor info + TensorShape shape{input->tensor_shape()}; + + for (size_t i = 0; i < num_of_interm_tensors; i++) + { + shape.set(argminmax_axis[i], 1); + interm_tensors[i].set_data_type(input->data_type()); + interm_tensors[i].set_tensor_shape(shape); + interm_tensors[i].set_num_channels(input->num_channels()); + } + + // Set a vector that is ordered ITensorInfo sequentially. 
+ std::vector<const ITensorInfo *> tensors; + tensors.emplace_back(input); + for (size_t i = 0; i < num_of_interm_tensors; i++) + { + tensors.emplace_back(interm_tensors.get() + i); + } + tensors.emplace_back(output); + + // Validate argminmax only on all kernels + for (size_t i = 0; i < num_of_kernels; i++) + { + ARM_COMPUTE_RETURN_ON_ERROR( + CLArgMinMaxKernel::validate(tensors[i], tensors[i + 1], argminmax_axis[i], op)); + } + + return Status{}; +} + +void CLArgMinMax::run() +{ + for (size_t i = 0; i < _num_of_kernels; ++i) + { + CLScheduler::get().enqueue(_argminmax_kernels[i]); + } +} + +} // namespace arm_compute diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLArithmeticSubtractionEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLArithmeticSubtractionEx.cpp new file mode 100644 index 000000000..3f403c80a --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLArithmeticSubtractionEx.cpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLArithmeticSubtractionEx.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionExKernel.h" + +using namespace arm_compute; + +void CLArithmeticSubtractionEx::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, + ConvertPolicy policy) +{ + auto k = arm_compute::support::cpp14::make_unique<CLArithmeticSubtractionExKernel>(); + k->configure(input1, input2, output, policy); + _kernel = std::move(k); + + if (output->info()->dimension(0) > 1) + { + ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2; + + if (broadcasted_info->info()->dimension(0) == 1) + { + _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + } + } +} + +Status CLArithmeticSubtractionEx::validate(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output, ConvertPolicy policy) +{ + return CLArithmeticSubtractionExKernel::validate(input1, input2, output, policy); +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLBatchToSpaceND.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLBatchToSpaceND.cpp new file mode 100644 index 000000000..26e3798cc --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLBatchToSpaceND.cpp @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLBatchToSpaceND.h" + +#include "arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h" + +using namespace arm_compute; + +void CLBatchToSpaceND::configure(ICLTensor *input, ICLTensor *output, const int32_t *block_size) +{ + auto k = arm_compute::support::cpp14::make_unique<CLBatchToSpaceNDKernel>(); + k->configure(input, output, block_size); + _kernel = std::move(k); +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp new file mode 100644 index 000000000..7c5fe5eda --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h" + +#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, + BinaryLogicalOperation op) +{ + auto k = arm_compute::support::cpp14::make_unique<CLBinaryLogicalOpKernel>(); + k->configure(input1, input2, output, op); + _kernel = std::move(k); + + if (output->info()->dimension(0) > 1) + { + ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2; + if (broadcasted_info->info()->dimension(0) == 1) + { + _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + } + } +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp index e1059ab53..8e106737c 100644 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp @@ -17,7 +17,6 @@ #include "arm_compute/runtime/CL/functions/CLCast.h" #include "arm_compute/core/CL/kernels/CLCastKernel.h" -#include "support/ToolchainSupport.h" using namespace arm_compute; diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLComparisonOp.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLComparisonOp.cpp new file mode 100644 index 000000000..f6a745a25 --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLComparisonOp.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLComparisonOp.h" + +#include "arm_compute/core/CL/kernels/CLComparisonOpKernel.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +void CLComparisonOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, + const ComparisonOperation &op) +{ + auto k = arm_compute::support::cpp14::make_unique<CLComparisonOpKernel>(); + k->configure(input1, input2, output, op); + _kernel = std::move(k); + + if (output->info()->dimension(0) > 1) + { + ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2; + + if (broadcasted_info->info()->dimension(0) == 1) + { + _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + } + } +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp new file mode 100644 index 000000000..c2e4ca9ff --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLDepthToSpace.h" + +#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h" + +using namespace arm_compute; + +void CLDepthToSpace::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size) +{ + auto k = arm_compute::support::cpp14::make_unique<CLDepthToSpaceKernel>(); + k->configure(input, output, block_size); + _kernel = std::move(k); +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp new file mode 100644 index 000000000..2781784ca --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLEmbeddingLookup.h" + +#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h" + +using namespace arm_compute; + +void CLEmbeddingLookup::configure(const ICLTensor *input, ICLTensor *output, + const ICLTensor *lookups) +{ + auto k = arm_compute::support::cpp14::make_unique<CLEmbeddingLookupKernel>(); + k->configure(input, output, lookups); + _kernel = std::move(k); +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLExp.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLExp.cpp new file mode 100644 index 000000000..411fa8700 --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLExp.cpp @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLExp.h" + +#include "arm_compute/core/CL/kernels/CLExpKernel.h" + +using namespace arm_compute; + +void CLExp::configure(const ICLTensor *input, ICLTensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<CLExpKernel>(); + k->configure(input, output); + _kernel = std::move(k); +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp index 5552cbc6f..fb056fe45 100644 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp @@ -16,11 +16,7 @@ */ #include "arm_compute/runtime/CL/functions/CLGather.h" -#include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/kernels/CLGatherKernel.h" -#include "support/ToolchainSupport.h" - -#include <utility> using namespace arm_compute; diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp new file mode 100644 index 000000000..7180e9356 --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLHashtableLookup.h" + +#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h" + +using namespace arm_compute; + +void CLHashtableLookup::configure(const ICLTensor *lookups, const ICLTensor *keys, + const ICLTensor *input, ICLTensor *output, ICLTensor *hits) +{ + auto k = arm_compute::support::cpp14::make_unique<CLHashtableLookupKernel>(); + k->configure(lookups, keys, input, output, hits); + _kernel = std::move(k); +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp new file mode 100644 index 000000000..be35ea732 --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLNeg.h" + +#include "arm_compute/core/CL/kernels/CLNegKernel.h" + +using namespace arm_compute; + +void CLNeg::configure(ICLTensor *input, ICLTensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<CLNegKernel>(); + k->configure(input, output); + _kernel = std::move(k); +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLNormalizationLayerEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLNormalizationLayerEx.cpp new file mode 100644 index 000000000..276c4557a --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLNormalizationLayerEx.cpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLNormalizationLayerEx.h" + +#include "arm_compute/runtime/CL/CLScheduler.h" + +using namespace arm_compute; + +CLNormalizationLayerEx::CLNormalizationLayerEx() : _norm_kernel(), _border_handler() {} + +void CLNormalizationLayerEx::configure(ICLTensor *input, ICLTensor *output, + const NormalizationLayerInfo &norm_info) +{ + ARM_COMPUTE_ERROR_ON(input == nullptr); + + // Configure normalization kernel + _norm_kernel.configure(input, output, norm_info); + + // Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel + _border_handler.configure(input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0)); +} + +Status CLNormalizationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, + const NormalizationLayerInfo &norm_info) +{ + return CLNormalizationLayerExKernel::validate(input, output, norm_info); +} + +void CLNormalizationLayerEx::run() +{ + // Run border handler + CLScheduler::get().enqueue(_border_handler, false); + + // Run normalization kernel + CLScheduler::get().enqueue(_norm_kernel); +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp new file mode 100644 index 000000000..38adedd10 --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLPReLU.h" + +#include "arm_compute/core/CL/kernels/CLPReLUKernel.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +void CLPReLU::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<CLPReLUKernel>(); + k->configure(input, alpha, output); + _kernel = std::move(k); + + if (output->info()->dimension(0) > 1) + { + ICLTensor *broadcasted_info = (input->info()->dimension(0) == 1) ? input : alpha; + + if (broadcasted_info->info()->dimension(0) == 1) + { + _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + } + } +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp new file mode 100644 index 000000000..5265b6c34 --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp @@ -0,0 +1,28 @@ +/* +* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +* Copyright (c) 2016-2018 ARM Limited. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ +#include "arm_compute/runtime/CL/functions/CLPadLayerEx.h" + +#include "arm_compute/core/CL/kernels/CLPadLayerKernel.h" + +using namespace arm_compute; + +void CLPadLayerEx::configure(ICLTensor *input, ICLTensor *output, ICLTensor *pad_size) +{ + auto k = arm_compute::support::cpp14::make_unique<CLPadLayerKernel>(); + k->configure(input, output, pad_size); + _kernel = std::move(k); +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLPermuteEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLPermuteEx.cpp new file mode 100644 index 000000000..fb363270d --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLPermuteEx.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLPermuteEx.h" + +#include "arm_compute/core/CL/kernels/CLPermuteExKernel.h" + +using namespace arm_compute; + +void CLPermuteEx::configure(const ICLTensor *input, ICLTensor *output, + const PermutationVector &perm) +{ + auto k = arm_compute::support::cpp14::make_unique<CLPermuteExKernel>(); + k->configure(input, output, perm); + _kernel = std::move(k); +} + +Status CLPermuteEx::validate(const ITensorInfo *input, const ITensorInfo *output, + const PermutationVector &perm) +{ + ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteExKernel::validate(input, output, perm)); + return Status{}; +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp index e1add5e90..dc0baa8dd 100644 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp @@ -18,9 +18,6 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h" -#include "support/ToolchainSupport.h" - -#include <utility> using namespace arm_compute; diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp deleted file mode 100644 index 3382058db..000000000 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/runtime/CL/functions/CLReduceMax.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "support/ToolchainSupport.h" -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/kernels/CLReduceMaxKernel.h" - -#include <vector> -#include <algorithm> - -#include <utility> - -#define REDUCE_MAX_RUN_ON_CPU 1 - -namespace arm_compute -{ - -CLReduceMax::CLReduceMax() : _axis(0), _input(nullptr), _output(nullptr), _kernel(nullptr) {} - -void CLReduceMax::configure(ICLTensor *input, int axis, ICLTensor *output) -{ - _axis = axis; - - _input = input; - _output = output; - - auto k = arm_compute::support::cpp14::make_unique<CLReduceMaxKernel>(); - k->configure(input, axis, output); - _kernel = std::move(k); - - // We can handle for simple case only - // Output rank: 1 - // Axis: one axis value, restrict to 1 - ARM_COMPUTE_ERROR_ON(input->info()->tensor_shape().num_dimensions() != 2); - ARM_COMPUTE_ERROR_ON(output->info()->tensor_shape().num_dimensions() != 1); - ARM_COMPUTE_ERROR_ON(axis != 1); -} - -Status CLReduceMax::validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output) -{ - return CLReduceMaxKernel::validate(input, axis, output); -} - -void CLReduceMax::run() -{ -#if REDUCE_MAX_RUN_ON_CPU - run_on_cpu(); - - arm_compute::CLScheduler::get().sync(); -#else - arm_compute::CLScheduler::get().enqueue(*_kernel); -#endif -} - -void CLReduceMax::run_on_cpu() -{ - cl::CommandQueue q = CLScheduler::get().queue(); - - _input->map(q); - _output->map(q); - - // Compute by CPU for simple case - // Input rank: 2 - // Output rank: 1 - // Axis: one axis value, restrict to 1 - - float *input_data = (float *)_input->buffer(); - float *output_data = (float *)_output->buffer(); - - std::vector<float> container_max; - int cols = _input->info()->tensor_shape()[0]; - int rows = _input->info()->tensor_shape()[1]; - container_max.resize(rows); - - // Initialize as 1st element in row - float *input_pointer = input_data; - for (int i = 0; i < rows; i++) - { - container_max[i] = *input_pointer; - input_pointer += cols; - } - - // Update max value in row - for (int i = 0; i < rows; i++) - { - float max_in_row = container_max[i]; - for (int j = 1; j < cols; j++) - { - if (max_in_row < input_data[i * cols + j]) - { - max_in_row = input_data[i * cols + j]; - } - } - container_max[i] = max_in_row; - } - - for (int i = 0; i < rows; i++) - { - output_data[i] = container_max[i]; - } - - _input->unmap(q); - _output->unmap(q); -} -} // namespace arm_compute diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp new file mode 100644 index 000000000..2b8d82706 --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLReduceOperation.h" + +#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +using namespace arm_compute; + +CLReduceOperation::CLReduceOperation() + : _input(nullptr), _output(nullptr), _axis(), _interm_tensors(), _reduce_kernels() +{ +} + +Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo *output, + const std::set<uint32_t> &axis, const ReduceOperation &op) +{ + const size_t num_of_kernels = axis.size(); + const size_t num_of_interm_tensors = num_of_kernels - 1; + + // Create temporary tensor infos + auto interm_tensors = + arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors); + + // Create intermediate tensor info + TensorShape shape{input->tensor_shape()}; + + auto it = axis.begin(); + for (size_t i = 0; i < num_of_interm_tensors; ++i, ++it) + { + shape.set(*it, 1); + interm_tensors[i].set_data_type(input->data_type()); + interm_tensors[i].set_tensor_shape(shape); + interm_tensors[i].set_num_channels(input->num_channels()); + } + + // Set a vector that is ordered ITensorInfo sequentially. + std::vector<const ITensorInfo *> tensors; + tensors.emplace_back(input); + for (size_t i = 0; i < num_of_interm_tensors; ++i) + { + tensors.emplace_back(interm_tensors.get() + i); + } + tensors.emplace_back(output); + + // Validate ReduceOperation only on all kernels + it = axis.begin(); + for (size_t i = 0; i < num_of_kernels; ++i, ++it) + { + ARM_COMPUTE_RETURN_ON_ERROR( + CLReduceOperationKernel::validate(tensors[i], tensors[i + 1], *it, op)); + } + + return Status{}; +} + +void CLReduceOperation::configure(ICLTensor *input, ICLTensor *output, + const std::set<uint32_t> &axis, ReduceOperation op) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), axis, op)); + + _axis = axis; + + _input = input; + _output = output; + + // NOTE The axis must have no duplication. + const size_t num_of_kernels = axis.size(); + const size_t num_of_interm_tensors = num_of_kernels - 1; + + _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors); + _reduce_kernels = + arm_compute::support::cpp14::make_unique<CLReduceOperationKernel[]>(num_of_kernels); + + TensorShape shape{input->info()->tensor_shape()}; + auto it = axis.begin(); + for (size_t i = 0; i < num_of_interm_tensors; ++i, ++it) + { + shape.set(*it, 1); + _interm_tensors[i].allocator()->init( + TensorInfo(shape, input->info()->num_channels(), input->info()->data_type())); + _interm_tensors[i].allocator()->allocate(); + } + + // Set a vector that is ordered ICLTensors sequentially. 
+ std::vector<ICLTensor *> tensors; + tensors.emplace_back(input); + for (size_t i = 0; i < num_of_interm_tensors; ++i) + { + tensors.emplace_back(_interm_tensors.get() + i); + } + tensors.emplace_back(output); + + // Apply ReduceOperation on all kernels + it = axis.begin(); + for (size_t i = 0; i < num_of_kernels; ++i, ++it) + { + _reduce_kernels[i].configure(tensors[i], tensors[i + 1], *it, op); + } +} + +void CLReduceOperation::run() +{ + const size_t num_of_kernels = _axis.size(); + for (size_t i = 0; i < num_of_kernels; ++i) + { + CLScheduler::get().enqueue(_reduce_kernels[i]); + } +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLReductionMean.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLReductionMean.cpp deleted file mode 100644 index ab724e752..000000000 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLReductionMean.cpp +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017-2018 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/runtime/CL/functions/CLReductionMean.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLReductionMeanKernel.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/Tensor.h" -#include "support/ToolchainSupport.h" - -using namespace arm_compute; - -CLReductionMean::CLReductionMean() : _reduction_mean_kernel(), _fill_border_kernel() {} - -Status CLReductionMean::validate(const ITensorInfo *input, const ITensorInfo *output, - std::vector<uint32_t> axis) -{ - ARM_COMPUTE_RETURN_ON_ERROR(CLReductionMeanKernel::validate(input, output, axis)); - return Status{}; -} - -void CLReductionMean::configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis) -{ - _reduction_mean_kernel.configure(input, output, axis); - _fill_border_kernel.configure(input, _reduction_mean_kernel.border_size(), BorderMode::CONSTANT, - PixelValue(0)); -} - -void CLReductionMean::run() -{ - CLScheduler::get().enqueue(_fill_border_kernel); - CLScheduler::get().enqueue(_reduction_mean_kernel); -} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp new file mode 100644 index 000000000..c03826891 --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLSpaceToBatchND.h" + +#include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h" + +using namespace arm_compute; + +void CLSpaceToBatchND::configure(const ICLTensor *input, const ICLTensor *block_size, + const ICLTensor *padding_size, ICLTensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<CLSpaceToBatchNDKernel>(); + k->configure(input, block_size, padding_size, output); + _kernel = std::move(k); +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp new file mode 100644 index 000000000..0f455f96f --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLSpaceToDepth.h" + +#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h" + +using namespace arm_compute; + +void CLSpaceToDepth::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size) +{ + auto k = arm_compute::support::cpp14::make_unique<CLSpaceToDepthKernel>(); + k->configure(input, output, block_size); + _kernel = std::move(k); +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLSquaredDifference.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLSquaredDifference.cpp new file mode 100644 index 000000000..dc6e4af44 --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLSquaredDifference.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLSquaredDifference.h" + +#include "arm_compute/core/CL/kernels/CLSquaredDifferenceKernel.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +void CLSquaredDifference::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<CLSquaredDifferenceKernel>(); + k->configure(input1, input2, output); + _kernel = std::move(k); + + if (output->info()->dimension(0) > 1) + { + ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2; + + if (broadcasted_info->info()->dimension(0) == 1) + { + _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + } + } +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSlice.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSlice.cpp deleted file mode 100644 index cd576cec1..000000000 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSlice.cpp +++ /dev/null @@ -1,307 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * Copyright 2018 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "arm_compute/runtime/CL/functions/CLStridedSlice.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h" -#include "arm_compute/core/utils/misc/Utility.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "support/ToolchainSupport.h" -#include <vector> - -using namespace arm_compute; - -static const int32_t maxDims = 4; - -// Return the index for the first element along that axis. This index will be a -// positive integer between [0, axisSize - 1] that can be used to index -// directly into the data. -inline int32_t StartForAxis(int32_t beginMask, std::vector<int32_t> const &startIndices, - std::vector<int32_t> const &strides, const TensorShape &inputShape, - int32_t axis) -{ - // Begin with the specified index - int32_t start = startIndices[axis]; - - // beginMask override - if (beginMask & 1 << axis) - { - if (strides[axis] > 0) - { - // Forward iteration - use the first element. These values will get - // clamped below (Note: We could have set them to 0 and axisSize-1, but - // use lowest() and max() to maintain symmetry with StopForAxis()) - start = std::numeric_limits<int32_t>::lowest(); - } - else - { - // Backward iteration - use the last element. - start = std::numeric_limits<int32_t>::max(); - } - } - - // Handle negative indices - int32_t axisSize = inputShape[axis]; - if (start < 0) - { - start += axisSize; - } - - // Clamping - start = arm_compute::utility::clamp(start, 0, axisSize - 1); - - return start; -} - -// Return the "real" index for the end of iteration along that axis. This is an -// "end" in the traditional C sense, in that it points to one past the last -// element. ie. 
So if you were iterating through all elements of a 1D array of -// size 4, this function would return 4 as the stop, because it is one past the -// "real" indices of 0, 1, 2 & 3. -inline int32_t StopForAxis(int32_t endMask, std::vector<int32_t> const &stopIndices, - std::vector<int32_t> const &strides, const TensorShape &inputShape, - int32_t axis) -{ - // Begin with the specified index - int32_t stop = stopIndices[axis]; - - // endMask override - if (endMask & (1 << axis)) - { - if (strides[axis] > 0) - { - // Forward iteration - use the last element. These values will get - // clamped below - stop = std::numeric_limits<int32_t>::max(); - } - else - { - // Backward iteration - use the first element. - stop = std::numeric_limits<int32_t>::lowest(); - } - } - - // Handle negative indices - int32_t axisSize = inputShape[axis]; - if (stop < 0) - { - stop += axisSize; - } - - // Clamping - // Because the end index points one past the last element, we need slightly - // different clamping ranges depending on the direction. - if (strides[axis] > 0) - { - // Forward iteration - stop = arm_compute::utility::clamp(stop, 0, axisSize); - } - else - { - // Backward iteration - stop = arm_compute::utility::clamp(stop, -1, axisSize - 1); - } - - return stop; -} - -inline int32_t offset4D(const TensorShape &shape, int32_t b, int32_t d, int32_t h, int32_t w) -{ - int32_t offset = b * shape[2] * shape[1] * shape[0]; - offset += d * shape[1] * shape[0]; - offset += h * shape[0]; - offset += w; - return offset; -} - -void CLStridedSlice::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData, - ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, - int32_t endMask, int32_t shrinkAxisMask) -{ - auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceKernel>(); - k->configure(input, output, beginData, endData, stridesData, beginMask, endMask, shrinkAxisMask); - _kernel = std::move(k); -} - -void CLStridedSliceCPU::configure(ICLTensor *input, ICLTensor *output, ICLTensor *beginData, - ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, - int32_t endMask, int32_t shrinkAxisMask) -{ - ARM_COMPUTE_ERROR_THROW_ON(CLStridedSliceKernel::validate( - input->info(), output->info(), beginData->info(), endData->info(), stridesData->info(), - beginMask, endMask, shrinkAxisMask)); - - _input = input; - _output = output; - _beginData = beginData; - _endData = endData; - _stridesData = stridesData; - _beginMask = beginMask; - _endMask = endMask; - _shrinkAxisMask = shrinkAxisMask; -} - -void CLStridedSliceCPU::run() -{ - run_on_cpu(); - - arm_compute::CLScheduler::get().sync(); -} - -inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride) -{ - if (stride > 0) - { - return ((stop - start - 1) / stride) + 1; - } - else - { - return ((stop - start + 1) / stride) + 1; - } -} - -template <typename T> -inline void StridedSlice(const T *inputData, const TensorShape &inputShape, int32_t beginMask, - int32_t endMask, const std::vector<int32_t> &startIndices, - const std::vector<int32_t> &stopIndices, - const std::vector<int32_t> &strides, T *outputData) -{ - ARM_COMPUTE_ERROR_ON(startIndices.size() != maxDims); - ARM_COMPUTE_ERROR_ON(stopIndices.size() != maxDims); - ARM_COMPUTE_ERROR_ON(strides.size() != maxDims); - - const int32_t start_b = StartForAxis(beginMask, startIndices, strides, inputShape, 3); - const int32_t stop_b = StopForAxis(endMask, stopIndices, strides, inputShape, 3); - const int32_t start_d = StartForAxis(beginMask, startIndices, strides, inputShape, 2); 
- const int32_t stop_d = StopForAxis(endMask, stopIndices, strides, inputShape, 2); - const int32_t start_h = StartForAxis(beginMask, startIndices, strides, inputShape, 1); - const int32_t stop_h = StopForAxis(endMask, stopIndices, strides, inputShape, 1); - const int32_t start_w = StartForAxis(beginMask, startIndices, strides, inputShape, 0); - const int32_t stop_w = StopForAxis(endMask, stopIndices, strides, inputShape, 0); - - // The shape of outputData may collapse in one-dimension. - // Therefore, it is necessary to create a shape that matches the result of the outputData. - TensorShape outputShape( - getOutDim(start_w, stop_w, strides[0]), getOutDim(start_h, stop_h, strides[1]), - getOutDim(start_d, stop_d, strides[2]), getOutDim(start_b, stop_b, strides[3])); - for (int32_t in_b = start_b, b = 0; strides[3] > 0 ? in_b < stop_b : in_b > stop_b; - in_b += strides[3], b++) - { - for (int32_t in_d = start_d, d = 0; strides[2] > 0 ? in_d < stop_d : in_d > stop_d; - in_d += strides[2], d++) - { - for (int32_t in_h = start_h, h = 0; strides[1] > 0 ? in_h < stop_h : in_h > stop_h; - in_h += strides[1], h++) - { - for (int32_t in_w = start_w, w = 0; strides[0] > 0 ? in_w < stop_w : in_w > stop_w; - in_w += strides[0], w++) - { - outputData[offset4D(outputShape, b, d, h, w)] = - inputData[offset4D(inputShape, in_b, in_d, in_h, in_w)]; - } - } - } - } -} - -void CLStridedSliceCPU::run_on_cpu() -{ - // TODO: Support shrinkAxisMask - cl::CommandQueue q = CLScheduler::get().queue(); - - _input->map(q); - _output->map(q); - _beginData->map(q); - _endData->map(q); - _stridesData->map(q); - - TensorShape inputShape = _input->info()->tensor_shape(); - TensorShape outputShape = _output->info()->tensor_shape(); - - std::vector<int32_t> starts; - std::vector<int32_t> stops; - std::vector<int32_t> strides; - - for (uint32_t idx = 0; idx <= _input->info()->num_dimensions() - 1; ++idx) - { - starts.emplace_back(reinterpret_cast<int32_t *>(_beginData->buffer())[idx]); - stops.emplace_back(reinterpret_cast<int32_t *>(_endData->buffer())[idx]); - strides.emplace_back(reinterpret_cast<int32_t *>(_stridesData->buffer())[idx]); - } - - for (uint32_t i = _input->info()->num_dimensions(); i < maxDims; i++) - { - starts.emplace_back(0); - stops.emplace_back(1); - strides.emplace_back(1); - } - - switch (_input->info()->data_type()) - { - case DataType::U8: - case DataType::QASYMM8: - StridedSlice(reinterpret_cast<const uint8_t *>(_input->buffer()), inputShape, _beginMask, - _endMask, starts, stops, strides, - reinterpret_cast<uint8_t *>(_output->buffer())); - break; - case DataType::S8: - case DataType::QS8: - StridedSlice(reinterpret_cast<const int8_t *>(_input->buffer()), inputShape, _beginMask, - _endMask, starts, stops, strides, reinterpret_cast<int8_t *>(_output->buffer())); - break; - case DataType::U16: - StridedSlice(reinterpret_cast<const uint16_t *>(_input->buffer()), inputShape, _beginMask, - _endMask, starts, stops, strides, - reinterpret_cast<uint16_t *>(_output->buffer())); - break; - case DataType::S16: - case DataType::QS16: - StridedSlice(reinterpret_cast<const int16_t *>(_input->buffer()), inputShape, _beginMask, - _endMask, starts, stops, strides, - reinterpret_cast<int16_t *>(_output->buffer())); - break; - case DataType::F16: - // Not sure this works. 
- StridedSlice(reinterpret_cast<const half *>(_input->buffer()), inputShape, _beginMask, - _endMask, starts, stops, strides, reinterpret_cast<half *>(_output->buffer())); - break; - case DataType::U32: - StridedSlice(reinterpret_cast<const uint32_t *>(_input->buffer()), inputShape, _beginMask, - _endMask, starts, stops, strides, - reinterpret_cast<uint32_t *>(_output->buffer())); - break; - case DataType::S32: - StridedSlice(reinterpret_cast<const int32_t *>(_input->buffer()), inputShape, _beginMask, - _endMask, starts, stops, strides, - reinterpret_cast<int32_t *>(_output->buffer())); - break; - case DataType::F32: - StridedSlice(reinterpret_cast<const float *>(_input->buffer()), inputShape, _beginMask, - _endMask, starts, stops, strides, reinterpret_cast<float *>(_output->buffer())); - break; - default: - ARM_COMPUTE_ERROR("DataType not supported"); - break; - } - - _input->unmap(q); - _output->unmap(q); - _beginData->unmap(q); - _endData->unmap(q); - _stridesData->unmap(q); -} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSliceEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSliceEx.cpp new file mode 100644 index 000000000..be7353493 --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSliceEx.cpp @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLStridedSliceEx.h" + +#include "arm_compute/core/CL/kernels/CLStridedSliceExKernel.h" + +using namespace arm_compute; + +void CLStridedSliceEx::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData, + ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, + int32_t endMask, int32_t shrinkAxisMask) +{ + auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceExKernel>(); + k->configure(input, output, beginData, endData, stridesData, beginMask, endMask, shrinkAxisMask); + _kernel = std::move(k); +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp index 6426364c9..19177497c 100644 --- a/libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp @@ -15,12 +15,9 @@ * limitations under the License. */ #include "arm_compute/runtime/CL/functions/CLTopKV2.h" +#include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/CLHelpers.h" - -#include <vector> -#include <algorithm> #include "../../topk_v2.h" diff --git a/libs/ARMComputeEx/src/runtime/NEON/functions/NENormalizationLayerEx.cpp b/libs/ARMComputeEx/src/runtime/NEON/functions/NENormalizationLayerEx.cpp new file mode 100644 index 000000000..988e92715 --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/NEON/functions/NENormalizationLayerEx.cpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/NEON/functions/NENormalizationLayerEx.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +using namespace arm_compute; + +NENormalizationLayerEx::NENormalizationLayerEx(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _norm_kernel(), _multiply_kernel(), + _border_handler(), _input_squared() +{ +} + +void NENormalizationLayerEx::configure(const ITensor *input, ITensor *output, + const NormalizationLayerInfo &norm_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + TensorInfo tensor_info(input->info()->tensor_shape(), 1, input->info()->data_type(), + input->info()->quantization_info()); + _input_squared.allocator()->init(tensor_info); + + // Manage intermediate buffers + _memory_group.manage(&_input_squared); + + // Configure kernels + _norm_kernel.configure(input, &_input_squared, output, norm_info); + _multiply_kernel.configure(input, input, &_input_squared, 1.0f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + _border_handler.configure(&_input_squared, _norm_kernel.border_size(), BorderMode::CONSTANT, + PixelValue(0.0f)); + + // Allocate the tensor once the configure methods have been called + _input_squared.allocator()->allocate(); +} + +Status NENormalizationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, + const NormalizationLayerInfo &norm_info) +{ + // Perform validation step + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + + ARM_COMPUTE_RETURN_ON_ERROR( + NENormalizationLayerExKernel::validate(input, input, output, norm_info)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate( + input, input, output, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + + return Status{}; +} + +void NENormalizationLayerEx::run() +{ + _memory_group.acquire(); + + NEScheduler::get().schedule(&_multiply_kernel, Window::DimY); + NEScheduler::get().schedule(&_border_handler, Window::DimY); + NEScheduler::get().schedule(&_norm_kernel, Window::DimY); + + _memory_group.release(); +} diff --git a/libs/ARMComputeEx/src/runtime/topk_v2.h b/libs/ARMComputeEx/src/runtime/topk_v2.h index a18ff0b0d..f94effea1 100644 --- a/libs/ARMComputeEx/src/runtime/topk_v2.h +++ b/libs/ARMComputeEx/src/runtime/topk_v2.h @@ -15,6 +15,12 @@ * limitations under the License. */ +/** + * @file topk_v2.h + * @brief This file contains TopK method and TopContainer class for TopK operation + * @ingroup COM_AI_RUNTIME + */ + #ifndef __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__ #define __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__ @@ -26,34 +32,62 @@ namespace rt { namespace optimized_ops { -// The follwing codes are impemented and modified while referring to TFLite topk_v2.cc file. -// TopK_v2 of NN Runtime supports TENSOR_FLOAT32, TENSOR_QUANT8_ASYMM, TENSOR_INT32 other than -// TFLite. -//(TFLite additionaly supports kTfLiteInt64.) 
-
-// The class that collects top indexes of k values. Based on template
-// tensorflow::gtl::TopN<> but, for optimization,
-// it re-uses the same container.
+/**
+ * @brief Class that collects the top indexes of k values
+ * @note The following code is implemented and modified while referring to the TFLite topk_v2.cc file.
+ * TopK_v2 of NN Runtime supports TENSOR_FLOAT32, TENSOR_QUANT8_ASYMM and TENSOR_INT32, which differs
+ * from TFLite.
+ * (TFLite additionally supports kTfLiteInt64.)
+ *
+ * This class collects the top indexes of k values. It is based on the template
+ * tensorflow::gtl::TopN<> but, for optimization,
+ * it re-uses the same container.
+ */
 template <typename T> class TopContainer
 {
 public:
+  /**
+   * @brief Prevent the default constructor of this class
+   */
   TopContainer() = delete;
+  /**
+   * @brief Constructor with params
+   * @param [in] k The number of top elements to keep
+   * @param [in] row_size Size of a row in the data
+   */
   TopContainer(int32 k, int32 row_size) : k_(k), container_(), values_(nullptr)
   {
     container_.reserve(std::min(k, row_size) + 1);
   }
 
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  /**
+   * @brief Prevent instances of this class from being copied (As this class contains pointers)
+   * @param [in] topContainer Instance to copy
+   */
   TopContainer(const TopContainer &) = delete;
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  /**
+   * @brief Prevent instances of this class from being copied (As this class contains pointers)
+   * @param [in] topContainer Instance to copy
+   * @return Reference to this TopContainer
+   */
   TopContainer &operator=(const TopContainer &) = delete;
 
+  /**
+   * @brief Start collecting top-k candidates from a new set of values
+   * @param [in] values Values to collect from
+   * @return N/A
+   */
   void start_collecting(const T *values)
   {
     values_ = values;
     container_.clear();
   }
 
+  /**
+   * @brief Push an index whose value is a candidate for the top k
+   * @param [in] a Index of the value to compare
+   * @return N/A
+   */
   void push(int32 a)
   {
     auto comparator = [this](int32 a, int32 b) { return compare_fun(a, b); };
@@ -74,6 +108,10 @@ public:
     }
   }
 
+  /**
+   * @brief Get the sorted result from the pushed values
+   * @return Reference to the vector of indexes sorted by their values
+   */
   const std::vector<int32> &sorted_result()
   {
     auto comparator = [this](int32 a, int32 b) { return compare_fun(a, b); };
@@ -111,6 +149,16 @@ private:
   }
 };
 
+/**
+ * @brief Perform the TopK operation with the given parameters
+ * @param [in] row_size Size of a row in the data
+ * @param [in] num_rows The number of rows in the data
+ * @param [in] data Input data
+ * @param [in] k The number of top elements to find
+ * @param [out] output_indexes Indexes of targets in the top k predictions
+ * @param [out] output_values Values of targets in the top k predictions
+ * @return N/A
+ */
 template <typename T>
 void TopK(int32 row_size, int32 num_rows, const T *data, int32 k, int32 *output_indexes,
           T *output_values)
diff --git a/libs/CMakeLists.txt b/libs/CMakeLists.txt
index 687159725..99d2028f4 100644
--- a/libs/CMakeLists.txt
+++ b/libs/CMakeLists.txt
@@ -1,3 +1,4 @@
-add_subdirectory(util)
-add_subdirectory(support)
-add_subdirectory(ARMComputeEx)
+# Add all subdirectories.
+# Each library in a sub-directory must have its own CMakeLists.txt
+# to build the library's binaries or to support an interface.
+add_subdirectories()
diff --git a/libs/cpp14/CMakeLists.txt b/libs/cpp14/CMakeLists.txt
new file mode 100644
index 000000000..bba9e132d
--- /dev/null
+++ b/libs/cpp14/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_library(nnfw_lib_cpp14 INTERFACE)
+target_include_directories(nnfw_lib_cpp14 INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include)
diff --git a/libs/cpp14/include/cpp14/memory.h b/libs/cpp14/include/cpp14/memory.h
new file mode 100644
index 000000000..b3e678baa
--- /dev/null
+++ b/libs/cpp14/include/cpp14/memory.h
@@ -0,0 +1,29 @@
+/**
+ * @file memory.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains @c make_unique, which is not supported by C++11
+ */
+#ifndef __NNFW_CPP14_MEMORY_H__
+#define __NNFW_CPP14_MEMORY_H__
+
+#include <memory>
+
+namespace nnfw
+{
+namespace cpp14
+{
+/**
+ * @brief Provide the @c make_unique function introduced in C++14
+ * @param[in] args List of arguments with which an instance of T will be constructed.
+ * @return @c std::unique_ptr of an instance of type T
+ */
+template <typename T, typename... Args> std::unique_ptr<T> make_unique(Args &&... args)
+{
+  // NOTE std::make_unique is missing in the C++11 standard
+  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+}
+
+} // namespace cpp14
+} // namespace nnfw
+
+#endif // __NNFW_CPP14_MEMORY_H__
diff --git a/libs/misc/CMakeLists.txt b/libs/misc/CMakeLists.txt
new file mode 100644
index 000000000..cd01695fb
--- /dev/null
+++ b/libs/misc/CMakeLists.txt
@@ -0,0 +1,13 @@
+# Library `nnfw_lib_misc`
+set(NNFW_UTILITY_SRCS src/environment.cpp)
+list(APPEND NNFW_UTILITY_SRCS src/tensor/Shape.cpp)
+list(APPEND NNFW_UTILITY_SRCS src/tensor/NonIncreasingStride.cpp)
+list(APPEND NNFW_UTILITY_SRCS src/tensor/IndexFormatter.cpp)
+list(APPEND NNFW_UTILITY_SRCS src/tensor/Comparator.cpp)
+
+add_library(nnfw_lib_misc STATIC ${NNFW_UTILITY_SRCS})
+target_include_directories(nnfw_lib_misc PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
+set_target_properties(nnfw_lib_misc PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
+add_executable(nnfw_tensor_index_iterator "examples/tensor_index_iterator.cpp")
+target_link_libraries(nnfw_tensor_index_iterator nnfw_lib_misc)
diff --git a/libs/util/examples/tensor_index_iterator.cpp b/libs/misc/examples/tensor_index_iterator.cpp
index 284e04aa0..8a19dac87 100644
--- a/libs/util/examples/tensor_index_iterator.cpp
+++ b/libs/misc/examples/tensor_index_iterator.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
*/ -#include "util/tensor/IndexIterator.h" +#include "misc/tensor/IndexIterator.h" #include <array> @@ -25,14 +25,14 @@ void test_iterate(void) { - const nnfw::util::tensor::Shape shape{3, 4, 7}; + const nnfw::misc::tensor::Shape shape{3, 4, 7}; std::array<int, 3 * 4 * 7> array; array.fill(0); - using nnfw::util::tensor::iterate; - using nnfw::util::tensor::Index; + using nnfw::misc::tensor::iterate; + using nnfw::misc::tensor::Index; iterate(shape) << [&](const Index &index) { assert(index.rank() == shape.rank()); @@ -57,11 +57,11 @@ int main(int argc, char **argv) { test_iterate(); - nnfw::util::tensor::Shape shape{3, 4, 3, 4}; + nnfw::misc::tensor::Shape shape{3, 4, 3, 4}; std::cout << "Iterate over tensor{3, 4, 3, 4}" << std::endl; - nnfw::util::tensor::iterate(shape) << [](const nnfw::util::tensor::Index &index) { + nnfw::misc::tensor::iterate(shape) << [](const nnfw::misc::tensor::Index &index) { std::cout << "rank: " << index.rank() << std::endl; for (size_t d = 0; d < index.rank(); ++d) diff --git a/libs/misc/include/misc/EnvVar.h b/libs/misc/include/misc/EnvVar.h new file mode 100644 index 000000000..47206d4c0 --- /dev/null +++ b/libs/misc/include/misc/EnvVar.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+/**
+ * @file EnvVar.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains the nnfw::misc::EnvVar class
+ */
+
+#ifndef __NNFW_MISC_ENV_VAR__
+#define __NNFW_MISC_ENV_VAR__
+
+#include <algorithm>
+#include <array>
+#include <cstdlib>
+#include <string>
+
+namespace nnfw
+{
+namespace misc
+{
+/**
+ * @brief Class to access an environment variable
+ */
+class EnvVar
+{
+public:
+  /**
+   * @brief Construct a new EnvVar object
+   * @param[in] key Name of the environment variable
+   */
+  EnvVar(const std::string &key)
+  {
+    const char *value = std::getenv(key.c_str());
+    if (value == nullptr)
+    {
+      // An empty string is considered as an empty value
+      _value = "";
+    }
+    else
+    {
+      _value = value;
+    }
+  }
+
+  /**
+   * @brief Get the environment variable as a string
+   * @param[in] def Default value of the environment variable
+   * @return Default value passed as a parameter when the environment variable is not set,
+   *         otherwise the value of the environment variable
+   */
+  std::string asString(const std::string &def) const
+  {
+    if (_value.empty())
+      return def;
+    return _value;
+  }
+
+  /**
+   * @brief Get the environment variable as a boolean
+   * @param[in] def Default value of the environment variable
+   * @return Default value passed as a parameter when the environment variable is not set,
+   *         otherwise the value interpreted as a boolean ("0", "OFF", "FALSE", "N" and "NO" are false)
+   */
+  bool asBool(bool def) const
+  {
+    if (_value.empty())
+      return def;
+    static const std::array<std::string, 5> false_list{"0", "OFF", "FALSE", "N", "NO"};
+    auto false_found = std::find(false_list.begin(), false_list.end(), _value);
+    return (false_found == false_list.end());
+  }
+
+  /**
+   * @brief Get the environment variable as an int
+   * @param[in] def Default value of the environment variable
+   * @return Default value passed as a parameter when the environment variable is not set,
+   *         otherwise the value converted to an integer
+   */
+  int asInt(int def) const
+  {
+    if (_value.empty())
+      return def;
+    return std::stoi(_value);
+  }
+
+private:
+  std::string _value;
+};
+
+} // namespace misc
+} // namespace nnfw
+
+#endif // __NNFW_MISC_ENV_VAR__
diff --git a/libs/misc/include/misc/benchmark.h b/libs/misc/include/misc/benchmark.h
new file mode 100644
index 000000000..fe5b97585
--- /dev/null
+++ b/libs/misc/include/misc/benchmark.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file benchmark.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains the nnfw::misc::benchmark::Accumulator class
+ */
+#ifndef __NNFW_MISC_BENCHMARK_H__
+#define __NNFW_MISC_BENCHMARK_H__
+
+#include <chrono>
+
+namespace nnfw
+{
+namespace misc
+{
+// Benchmark support
+namespace benchmark
+{
+
+/**
+ * @brief Class to accumulate time during a benchmark
+ */
+template <typename T> class Accumulator
+{
+public:
+  /**
+   * @brief Construct a new Accumulator object
+   * @param[in] ref Object that stores the accumulated duration
+   */
+  Accumulator(T &ref) : _ref(ref)
+  {
+    // DO NOTHING
+  }
+
+public:
+  /**
+   * @brief Return the reference to @c ref passed to the constructor
+   * @return Reference to @c ref
+   */
+  T &operator()(void) { return _ref; }
+
+private:
+  T &_ref;
+};
+
+/**
+ * @brief Run the passed function and return the accumulated time
+ * @tparam T Period used by @c std::chrono::duration_cast
+ * @tparam Callable Function type to benchmark
+ * @param[in] acc Accumulator to which the time spent running @c cb is added
+ * @param[in] cb Function to run and benchmark
+ * @return Reference to the accumulator
+ */
+template <typename T, typename Callable>
+Accumulator<T> &operator<<(Accumulator<T> &&acc, Callable cb)
+{
+  auto begin = std::chrono::steady_clock::now();
+  cb();
+  auto end = std::chrono::steady_clock::now();
+
+  acc() += std::chrono::duration_cast<T>(end - begin);
+
+  return acc;
+}
+
+template <typename T> Accumulator<T> measure(T &out) { return Accumulator<T>(out); }
+
+} // namespace benchmark
+} // namespace misc
+} // namespace nnfw
+
+#endif // __NNFW_MISC_BENCHMARK_H__
diff --git a/libs/misc/include/misc/environment.h b/libs/misc/include/misc/environment.h
new file mode 100644
index 000000000..8e6bd00d5
--- /dev/null
+++ b/libs/misc/include/misc/environment.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file environment.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains utility functions and classes to access environment variables
+ */
+
+#ifndef __UTIL_ENVIRONMENT_H__
+#define __UTIL_ENVIRONMENT_H__
+
+namespace nnfw
+{
+namespace misc
+{
+
+/**
+ * @brief Get the environment variable of int type
+ * @param[in] name Name of the environment variable
+ * @param[in] defaultValue Default value to use when the environment variable does not exist
+ * @return The int value of the environment variable
+ */
+int get_env_int(const char *name, int defaultValue = 0);
+
+/**
+ * @brief Get the environment variable of bool type
+ * @param[in] name Name of the environment variable
+ * @param[in] defaultValue Default value to use when the environment variable does not exist
+ * @return @c 0 if the value of the environment variable is @c "0", @c 1 for any other number
+ */
+bool get_env_bool(const char *name, bool defaultValue = false);
+}
+}
+
+#include <string>
+
+namespace nnfw
+{
+namespace misc
+{
+namespace env
+{
+/**
+ * @brief Parent struct of @ref IntAccessor and @ref FloatAccessor
+ * @tparam T Type of the value of the environment variable
+ */
+template <typename T> struct Accessor
+{
+  /**
+   * @brief Destroy the Accessor object
+   */
+  virtual ~Accessor() = default;
+  /**
+   * @brief Read the value of the environment variable
+   * @param[out] out The value of the environment variable
+   * @return @c true if accessing the environment variable succeeds,
+   *         @c false if no such environment variable exists
+   */
+  virtual bool access(T &out) const = 0;
+};
+
+/**
+ * @brief Class to read an int environment variable
+ */
+class IntAccessor : public Accessor<int>
+{
+public:
+  /**
+   * @brief Construct a new IntAccessor object
+   * @param[in] tag Name of the environment variable
+   */
+  IntAccessor(const std::string &tag);
+
+public:
+  /**
+   * @brief Read the value of the environment variable
+   * @param[out] out The value of the environment variable
+   * @return @c true if accessing the environment variable succeeds,
+   *         @c false if no such environment variable exists
+   */
+  bool access(int &out) const override;
+
+private:
+  std::string _tag;
+};
+
+/**
+ * @brief Class to read a float environment variable
+ */
+class FloatAccessor : public Accessor<float>
+{
+public:
+  /**
+   * @brief Construct a new FloatAccessor object
+   * @param[in] tag Name of the environment variable
+   */
+  FloatAccessor(const std::string &tag);
+
+public:
+  /**
+   * @brief Read the value of the environment variable
+   * @param[out] out The value of the environment variable
+   * @return @c true if accessing the environment variable succeeds,
+   *         @c false if no such environment variable exists
+   */
+  bool access(float &out) const override;
+
+private:
+  std::string _tag;
+};
+
+} // namespace env
+} // namespace misc
+} // namespace nnfw
+
+#endif // __UTIL_ENVIRONMENT_H__
diff --git a/libs/misc/include/misc/feature/Index.h b/libs/misc/include/misc/feature/Index.h
new file mode 100644
index 000000000..a361d8dd2
--- /dev/null
+++ b/libs/misc/include/misc/feature/Index.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file Index.h + * @brief This file contains Index class + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __NNFW_MISC_FEATURE_INDEX_H__ +#define __NNFW_MISC_FEATURE_INDEX_H__ + +#include <cstdint> + +namespace nnfw +{ +namespace misc +{ +namespace feature +{ + +/** + * @brief Class to have the index information for calculating the offset. + */ +class Index +{ +public: + /** + * @brief Construct Index object using default constrcutor + */ + Index() = default; + +public: + /** + * @brief Construct Index object with three indexes of dimensions + * @param[in] ch The depth index + * @param[in] row The heigth index + * @param[in] col The width index + */ + Index(int32_t ch, int32_t row, int32_t col) : _batch{1}, _ch{ch}, _row{row}, _col{col} + { + // DO NOTHING + } + /** + * @brief Construct Index object with four indexes of dimensions + * @param[in] batch The batch index + * @param[in] ch The depth index + * @param[in] row The height index + * @param[in] col The width index + */ + Index(int32_t batch, int32_t ch, int32_t row, int32_t col) + : _batch{batch}, _ch{ch}, _row{row}, _col{col} + { + // DO NOTHING + } + +public: + /** + * @brief Get the batch index + * @return The batch index + */ + int32_t batch(void) const { return _batch; } + /** + * @brief Get the depth index + * @return The depth index + */ + int32_t ch(void) const { return _ch; } + /** + * @brief Get the height index + * @return The height index + */ + int32_t row(void) const { return _row; } + /** + * @brief Get the width index + * @return The width index + */ + int32_t col(void) const { return _col; } + +public: + /** + * @brief Get the batch index as the lvalue reference + * @return The reference of the batch value + */ + int32_t &batch(void) { return _batch; } + /** + * @brief Get the depth index as the lvalue reference + * @return The reference of the depth value + */ + int32_t &ch(void) { return _ch; } + /** + * @brief Get the height index as the lvalue reference + * @return The reference of the height value + */ + int32_t &row(void) { return _row; } + /** + * @brief Get the width index as the lvalue reference + * @return The reference of the width value + */ + int32_t &col(void) { return _col; } + +private: + /** + * @brief The batch index + */ + int32_t _batch; + /** + * @brief The depth index + */ + int32_t _ch; + /** + * @brief The height index + */ + int32_t _row; + /** + * @brief The width index + */ + int32_t _col; +}; + +} // namespace feature +} // namespace misc +} // namespace nnfw + +#endif // __NNFW_MISC_FEATURE_INDEX_H__ diff --git a/libs/misc/include/misc/feature/IndexIterator.h b/libs/misc/include/misc/feature/IndexIterator.h new file mode 100644 index 000000000..1cf675526 --- /dev/null +++ b/libs/misc/include/misc/feature/IndexIterator.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file IndexIterator.h + * @brief This file contains IndexIterator class + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __NNFW_MISC_FEATURE_INDEX_ITERATOR_H__ +#define __NNFW_MISC_FEATURE_INDEX_ITERATOR_H__ + +#include "misc/feature/Shape.h" + +namespace nnfw +{ +namespace misc +{ +namespace feature +{ + +/** + * @brief Class to iterate Callable with Index of feature + */ +class IndexIterator +{ +public: + /** + * @brief Construct IndexIterator object with Shape of feature + * @param[in] shape Shape reference of feature + */ + IndexIterator(const Shape &shape) : _shape{shape} + { + // DO NOTHING + } + +public: + /** + * @brief Call a function iterated + * @param[in] cb A callback function + * @return Current IndexIterator object + */ + template <typename Callable> IndexIterator &iter(Callable cb) + { + for (int32_t batch = 0; batch < _shape.N; ++batch) + { + for (int32_t ch = 0; ch < _shape.C; ++ch) + { + for (int32_t row = 0; row < _shape.H; ++row) + { + for (int32_t col = 0; col < _shape.W; ++col) + { + cb(batch, ch, row, col); + } + } + } + } + + return (*this); + } + +private: + /** + * @brief Shape for feature + */ + const Shape _shape; +}; + +/** + * @brief Create an object of IndexIterator for feature + * @param[in] Shape reference of feature + * @return Created IndexIterator object + */ +static inline IndexIterator iterate(const Shape &shape) { return IndexIterator{shape}; } + +/** + * @brief Call a function iterated using IndexIterator of feature + * Overloaded operator<< + * @param[in] it An IndexIterator reference + * @param[in] cb A callback function + * @return created IndexIterator object + */ +template <typename Callable> IndexIterator &operator<<(IndexIterator &&it, Callable cb) +{ + return it.iter(cb); +} + +} // namespace feature +} // namespace misc +} // namespace nnfw + +#endif // __NNFW_MISC_FEATURE_INDEX_ITERATOR_H__ diff --git a/libs/misc/include/misc/feature/Object.h b/libs/misc/include/misc/feature/Object.h new file mode 100644 index 000000000..7af0e28f4 --- /dev/null +++ b/libs/misc/include/misc/feature/Object.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file Object.h + * @brief This file contains Object class + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __NNFW_MISC_FEATURE_OBJECT_H__ +#define __NNFW_MISC_FEATURE_OBJECT_H__ + +#include "misc/feature/Shape.h" +#include "misc/feature/Index.h" +#include "misc/feature/Reader.h" + +#include <vector> + +namespace nnfw +{ +namespace misc +{ +namespace feature +{ + +/** + * @brief Class to have information of the operand for feature + */ +template <typename T> class Object final : public Reader<T> +{ +public: + using Generator = std::function<T(const Shape &shape, const Index &index)>; + +public: + /** + * @brief Construct Object object with Shape of feature and set value used by Generator + * @param[in] shape Reference of Shape for feature + * @param[in] fn A function to set values of operand tensor + */ + Object(const Shape &shape, const Generator &fn) : _shape{shape} + { + _value.resize(_shape.C * _shape.H * _shape.W); + + for (int32_t ch = 0; ch < _shape.C; ++ch) + { + for (int32_t row = 0; row < _shape.H; ++row) + { + for (int32_t col = 0; col < _shape.W; ++col) + { + _value.at(offsetOf(ch, row, col)) = fn(_shape, Index{ch, row, col}); + } + } + } + } + +public: + /** + * @brief Get Shape of feature as the reference + * @return The reference of the width value + */ + const Shape &shape(void) const { return _shape; } + +public: + /** + * @brief Get the value used by three indexes + * @param[in] ch The depth index + * @param[in] row The height index + * @param[in] col The width index + * @return The value at the offset + */ + T at(uint32_t ch, uint32_t row, uint32_t col) const override + { + return _value.at(offsetOf(ch, row, col)); + } + +private: + /** + * @brief Get the offset value at three indexes + * @param[in] ch The depth index + * @param[in] row The height index + * @param[in] col The width index + * @return The offset value + */ + uint32_t offsetOf(uint32_t ch, uint32_t row, uint32_t col) const + { + return ch * _shape.H * _shape.W + row * _shape.W + col; + } + +private: + /** + * @brief Shape of operand + */ + Shape _shape; + /** + * @brief The tensor vector of operand + */ + std::vector<T> _value; +}; + +} // namespace feature +} // namespace misc +} // namespace nnfw + +#endif // __NNFW_MISC_FEATURE_OBJECT_H__ diff --git a/libs/misc/include/misc/feature/Reader.h b/libs/misc/include/misc/feature/Reader.h new file mode 100644 index 000000000..b09209789 --- /dev/null +++ b/libs/misc/include/misc/feature/Reader.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file Reader.h + * @brief This file contains Reader class + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __NNFW_MISC_FEATURE_READER_H__ +#define __NNFW_MISC_FEATURE_READER_H__ + +#include <cstdint> + +namespace nnfw +{ +namespace misc +{ +namespace feature +{ + +/** + * @brief Class reads values of feature + * The interface class + */ +template <typename T> struct Reader +{ + /** + * @brief Destruct Reader object using default destructor + */ + virtual ~Reader() = default; + + /** + * @brief Get the value used by three indexes + * @param[in] ch The depth index + * @param[in] row The height index + * @param[in] col The width index + * @return The value at the offset + */ + virtual T at(uint32_t ch, uint32_t row, uint32_t col) const = 0; + /** + * @brief Get the value used by four indexes + * @param[in] batch The batch index + * @param[in] ch The depth index + * @param[in] row The height index + * @param[in] col The width index + * @return The value at the offset + */ + virtual T at(uint32_t batch, uint32_t ch, uint32_t row, uint32_t col) const = 0; +}; + +} // namespace feature +} // namespace misc +} // namespace nnfw + +#endif // __NNFW_MISC_FEATURE_READER_H__ diff --git a/libs/misc/include/misc/feature/Shape.h b/libs/misc/include/misc/feature/Shape.h new file mode 100644 index 000000000..09881f58b --- /dev/null +++ b/libs/misc/include/misc/feature/Shape.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file Shape.h + * @brief This file contains Shape class for feature + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __NNFW_MISC_FEATURE_SHAPE_H__ +#define __NNFW_MISC_FEATURE_SHAPE_H__ + +#include <cstdint> + +namespace nnfw +{ +namespace misc +{ +namespace feature +{ + +/** + * @brief Structure to have values of dimensions for feature + */ +struct Shape +{ + int32_t N; /**< The batch value */ + int32_t C; /**< The depth value */ + int32_t H; /**< The height value */ + int32_t W; /**< The width value */ + + /** + * @brief Construct Shape object using default constrcutor + */ + Shape() = default; + /** + * @brief Construct Shape object with three values of dimensions + * @param[in] depth The depth value + * @param[in] height The height value + * @param[in] width The width value + */ + Shape(int32_t depth, int32_t height, int32_t width) : N{1}, C{depth}, H{height}, W{width} + { + // DO NOTHING + } + /** + * @brief Construct Shape object with four values of dimensions + * @param[in] batch The batch value + * @param[in] depth The depth value + * @param[in] height The height value + * @param[in] width The width value + */ + Shape(int32_t batch, int32_t depth, int32_t height, int32_t width) + : N{batch}, C{depth}, H{height}, W{width} + { + // DO NOTHING + } +}; + +} // namespace feature +} // namespace misc +} // namespace nnfw + +#endif // __NNFW_MISC_FEATURE_H__ diff --git a/libs/misc/include/misc/feature/TextFormatter.h b/libs/misc/include/misc/feature/TextFormatter.h new file mode 100644 index 000000000..e053f1c61 --- /dev/null +++ b/libs/misc/include/misc/feature/TextFormatter.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file TextFormatter.h + * @brief This file contains TextFormatter class + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __NNFW_MISC_FEATURE_TEXT_FORMATTER_H__ +#define __NNFW_MISC_FEATURE_TEXT_FORMATTER_H__ + +#include "misc/feature/Shape.h" +#include "misc/feature/Reader.h" + +#include <ostream> +#include <iomanip> +#include <limits> + +namespace nnfw +{ +namespace misc +{ +namespace feature +{ + +/** + * @brief Class to print operand of feature to ostream in the given string format + */ +template <typename T> class TextFormatter +{ +public: + /** + * @brief Construct TextFormatter object with an operand's information. 
+ * @param[in] shape The shape of an operand + * @param[in] data The data of an operand + */ + TextFormatter(const Shape &shape, const Reader<T> &data) : _shape(shape), _data(data) + { + // DO NOTHING + } + +public: + /** + * @brief Get Shape of feature as the lvalue reference + * @return Shape of feature + */ + const Shape &shape(void) const { return _shape; } + /** + * @brief Get Reader<T> that can read the data of an operand + * @return Reader<T> + */ + const Reader<T> &data(void) const { return _data; } + +private: + /** + * @brief Shape of feature + */ + const Shape &_shape; + /** + * @brief Reader<T> that can read the data of an operand + */ + const Reader<T> &_data; +}; + +/** + * @brief Print operand of feature + * @param[in] os Standard output stream + * @param[in] fmt TextFormatter to print information of an operand + * @return Standard output stream + */ +template <typename T> std::ostream &operator<<(std::ostream &os, const TextFormatter<T> &fmt) +{ + const auto &shape = fmt.shape(); + + for (uint32_t ch = 0; ch < shape.C; ++ch) + { + os << " Channel " << ch << ":" << std::endl; + for (uint32_t row = 0; row < shape.H; ++row) + { + os << " "; + for (uint32_t col = 0; col < shape.W; ++col) + { + const auto value = fmt.data().at(ch, row, col); + os << std::right; + os << std::fixed; + os << std::setw(std::numeric_limits<T>::digits10 + 2); + os << std::setprecision(5); + os << value; + os << " "; + } + os << std::endl; + } + } + + return os; +} + +} // namespace feature +} // namespace misc +} // namespace nnfw + +#endif // __NNFW_MISC_FEATURE_TEXT_FORMATTER_H__ diff --git a/libs/misc/include/misc/fp32.h b/libs/misc/include/misc/fp32.h new file mode 100644 index 000000000..c310402ba --- /dev/null +++ b/libs/misc/include/misc/fp32.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file fp32.h + * @brief This file contains functions to compare float values + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __NNFW_MISC_FP32_H__ +#define __NNFW_MISC_FP32_H__ + +#include <cmath> +#include <cfloat> +#include <algorithm> +#include <cstdint> + +namespace nnfw +{ +namespace misc +{ +namespace fp32 +{ + +/** + * @brief Get the difference between two float values as a relative value. + * @param[in] lhs A float value to be compared + * @param[in] rhs A float value to be compared + * @return A relative value of difference between two float values. 
+ */ +inline float relative_diff(float lhs, float rhs) +{ + const auto diff = std::fabs(lhs - rhs); + const auto base = std::max(std::fabs(lhs), std::fabs(rhs)); + + return diff / base; +} + +/** + * @brief Verify that an obtained float value is equal to the expected float value + * by using FLT_EPSILON + * @param[in] expected An expected float value to be compared + * @param[in] obtained An obtained float value to be compared + * @param[in] tolerance A tolerance value + * @return @c true if both values are equal, otherwise @c false + */ +inline bool epsilon_equal(float expected, float obtained, uint32_t tolerance = 1) +{ + if (std::isnan(expected) && std::isnan(obtained)) + { + return true; + } + + // Let's use relative epsilon comparision + const auto diff = std::fabs(expected - obtained); + const auto max = std::max(std::fabs(expected), std::fabs(obtained)); + + return diff <= (max * FLT_EPSILON * tolerance); +} + +/** + * @brief Verify that an obtained float value is equal to the expected float value + * by comparing absolute tolerance value + * @param[in] expected An expected float value to be compared + * @param[in] obtained An obtained float value to be compared + * @param[in] tolerance A tolerance value + * @return @c true if both values are equal, otherwise @c false + */ +inline bool absolute_epsilon_equal(float expected, float obtained, float tolerance = 0.001) +{ + if (std::isnan(expected) && std::isnan(obtained)) + { + return true; + } + + // Let's use absolute epsilon comparision + const auto diff = std::fabs(expected - obtained); + + return diff <= tolerance; +} + +} // namespace fp32 +} // namespace misc +} // namespace nnfw + +#endif // __NNFW_MISC_FP32_H__ diff --git a/libs/misc/include/misc/kernel/IndexIterator.h b/libs/misc/include/misc/kernel/IndexIterator.h new file mode 100644 index 000000000..59e0f0095 --- /dev/null +++ b/libs/misc/include/misc/kernel/IndexIterator.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file IndexIterator.h + * @brief This file contains IndexIterator class + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __NNFW_MISC_KERNEL_INDEX_ITERATOR_H__ +#define __NNFW_MISC_KERNEL_INDEX_ITERATOR_H__ + +#include "misc/kernel/Shape.h" + +namespace nnfw +{ +namespace misc +{ +namespace kernel +{ + +/** + * @brief Class to iterate Callable with Index of kernel + */ +class IndexIterator +{ +public: + /** + * @brief Construct IndexIterator object with Shape of kernel + * @param[in] shape Shape reference of feature + */ + IndexIterator(const Shape &shape) : _shape{shape} + { + // DO NOTHING + } + +public: + /** + * @brief Call a function iterated + * @param[in] cb A callback function + * @return Current IndexIterator object + */ + template <typename Callable> IndexIterator &iter(Callable cb) + { + for (int32_t nth = 0; nth < _shape.N; ++nth) + { + for (int32_t ch = 0; ch < _shape.C; ++ch) + { + for (int32_t row = 0; row < _shape.H; ++row) + { + for (int32_t col = 0; col < _shape.W; ++col) + { + cb(nth, ch, row, col); + } + } + } + } + + return (*this); + } + +private: + const Shape _shape; /**< Shape for kernel */ +}; + +/** + * @brief Create an object of IndexIterator for kernel + * @param[in] shape reference of feature + * @return Created IndexIterator object + */ +inline IndexIterator iterate(const Shape &shape) { return IndexIterator{shape}; } + +/** + * @brief Call a function iterated using IndexIterator of kernel + * Overloaded operator<< + * @param[in] it An IndexIterator reference + * @param[in] cb A callback function + * @return Created IndexIterator object + */ +template <typename Callable> IndexIterator &operator<<(IndexIterator &&it, Callable cb) +{ + return it.iter(cb); +} + +} // namespace kernel +} // namespace misc +} // namespace nnfw + +#endif // __NNFW_MISC_FEATURE_INDEX_ITERATOR_H__ diff --git a/libs/misc/include/misc/kernel/RandomObject.h b/libs/misc/include/misc/kernel/RandomObject.h new file mode 100644 index 000000000..4b58b0c7f --- /dev/null +++ b/libs/misc/include/misc/kernel/RandomObject.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file RandomObject.h + * @brief This file contains RandomObject class + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __NNFW_MISC_KERNEL_RANDOM_OBJECT_H__ +#define __NNFW_MISC_KERNEL_RANDOM_OBJECT_H__ + +#include "misc/kernel/Shape.h" +#include "misc/kernel/Reader.h" + +#include <vector> + +namespace nnfw +{ +namespace misc +{ +namespace kernel +{ + +template <typename T> class RandomObject final : public Reader<T> +{ +public: + RandomObject(const Shape &shape) : _shape{shape} + { + const uint32_t size = _shape.N * _shape.C * _shape.H * _shape.W; + + // TODO Use random number + for (uint32_t off = 0; off < size; ++off) + { + _value.emplace_back(static_cast<float>(off)); + } + } + +public: + const Shape &shape(void) const { return _shape; } + +public: + T at(uint32_t nth, uint32_t ch, uint32_t row, uint32_t col) const override + { + uint32_t index = 0; + + index += nth * _shape.C * _shape.H * _shape.W; + index += ch * _shape.H * _shape.W; + index += row * _shape.W; + index += col; + + return _value.at(index); + } + +private: + const Shape _shape; + std::vector<T> _value; +}; + +} // namespace kernel +} // namespace misc +} // namespace nnfw + +#endif // __NNFW_MISC_KERNEL_RANDOM_OBJECT_H__ diff --git a/libs/misc/include/misc/kernel/Reader.h b/libs/misc/include/misc/kernel/Reader.h new file mode 100644 index 000000000..019c809ee --- /dev/null +++ b/libs/misc/include/misc/kernel/Reader.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file Reader.h + * @brief This file contains Reader structure + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __NNFW_MISC_KERNEL_READER_H__ +#define __NNFW_MISC_KERNEL_READER_H__ + +#include <cstdint> + +namespace nnfw +{ +namespace misc +{ +namespace kernel +{ + +/** + * @brief Structure to Reader + */ +template <typename T> struct Reader +{ + /** + * @brief Destroy the Reader object as default + */ + virtual ~Reader() = default; + + /** + * @brief Get the value used by four indexes + * @param[in] nth The kernel index + * @param[in] ch The channel index + * @param[in] row The row index + * @param[in] col The column index + * @return The value at the offset + */ + virtual T at(uint32_t nth, uint32_t ch, uint32_t row, uint32_t col) const = 0; +}; + +} // namespace kernel +} // namespace misc +} // namespace nnfw + +#endif // __NNFW_MISC_KERNEL_READER_H__ diff --git a/libs/misc/include/misc/kernel/Shape.h b/libs/misc/include/misc/kernel/Shape.h new file mode 100644 index 000000000..27d6a8bf0 --- /dev/null +++ b/libs/misc/include/misc/kernel/Shape.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file Shape.h + * @brief This file contains Shape structure + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __NNFW_MISC_KERNEL_SHAPE_H__ +#define __NNFW_MISC_KERNEL_SHAPE_H__ + +#include <cstdint> + +namespace nnfw +{ +namespace misc +{ +namespace kernel +{ + +/** + * @brief Structure to Shape + */ +struct Shape +{ + int32_t N; /**< The kernel index */ + int32_t C; /**< The channel index */ + int32_t H; /**< The height index */ + int32_t W; /**< The width index */ + + /** + * @brief Construct a new Shape object as default + */ + Shape() = default; + + /** + * @brief Construct a new Shape object with parameters + * @param[in] count The kernel index + * @param[in] depth The channel index + * @param[in] height The height index + * @param[in] width The width index + */ + Shape(int32_t count, int32_t depth, int32_t height, int32_t width) + : N{count}, C{depth}, H{height}, W{width} + { + // DO NOTHING + } +}; + +} // namespace kernel +} // namespace misc +} // namespace nnfw + +#endif // __NNFW_MISC_KERNEL_SHAPE_H__ diff --git a/libs/misc/include/misc/matrix/IndexIterator.h b/libs/misc/include/misc/matrix/IndexIterator.h new file mode 100644 index 000000000..742ed3a65 --- /dev/null +++ b/libs/misc/include/misc/matrix/IndexIterator.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file IndexIterator.h + * @brief This file contains IndexIterator class + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __NNFW_MISC_MATRIX_INDEX_ITERATOR_H__ +#define __NNFW_MISC_MATRIX_INDEX_ITERATOR_H__ + +#include "misc/matrix/Shape.h" + +namespace nnfw +{ +namespace misc +{ +namespace matrix +{ + +/** + * @brief Class to iterate Callable with Index of matrix + */ +class IndexIterator +{ +public: + /** + * @brief Construct IndexIterator object with Shape of matrix + * @param[in] shape Shape reference of matrix + */ + IndexIterator(const Shape &shape) : _shape{shape} + { + // DO NOTHING + } + +public: + /** + * @brief Call a function iterated + * @param[in] cb A callback function + * @return Current IndexIterator object + */ + template <typename Callable> IndexIterator &iter(Callable cb) + { + for (uint32_t row = 0; row < _shape.H; ++row) + { + for (uint32_t col = 0; col < _shape.W; ++col) + { + cb(row, col); + } + } + + return (*this); + } + +private: + /** + * @brief Shape for matrix + */ + const Shape _shape; +}; + +/** + * @brief Create an object of IndexIterator for matrix + * @param[in] Shape reference of matrix + * @return Created IndexIterator object + */ +inline IndexIterator iterate(const Shape &shape) { return IndexIterator{shape}; } + +/** + * @brief Call a function iterated using IndexIterator of matrix + * Overloaded operator<< + * @param[in] it An IndexIterator reference + * @param[in] cb A callback function + * @return created IndexIterator object + */ +template <typename Callable> IndexIterator &operator<<(IndexIterator &&it, Callable cb) +{ + return it.iter(cb); +} + +} // namespace matrix +} // namespace misc +} // namespace nnfw + +#endif // __NNFW_MISC_MATRIX_INDEX_ITERATOR_H__ diff --git a/libs/misc/include/misc/matrix/Reader.h b/libs/misc/include/misc/matrix/Reader.h new file mode 100644 index 000000000..ea222c9d1 --- /dev/null +++ b/libs/misc/include/misc/matrix/Reader.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file Reader.h + * @brief This file contains Reader class + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __NNFW_MISC_MATRIX_READER_H__ +#define __NNFW_MISC_MATRIX_READER_H__ + +#include <cstdint> + +namespace nnfw +{ +namespace misc +{ +namespace matrix +{ + +/** + * @brief Class reads values of matrix + * The interface class + */ +template <typename T> struct Reader +{ + /** + * @brief Destruct Reader object using default destructor + */ + virtual ~Reader() = default; + + /** + * @brief Get the value used by two indexes + * @param[in] row The height index + * @param[in] col The width index + * @return The value at the offset + */ + virtual T at(uint32_t row, uint32_t col) const = 0; +}; + +} // namespace matrix +} // namespace misc +} // namespace nnfw + +#endif // __NNFW_MISC_MATRIX_READER_H__ diff --git a/libs/misc/include/misc/matrix/Shape.h b/libs/misc/include/misc/matrix/Shape.h new file mode 100644 index 000000000..8cbcc1e12 --- /dev/null +++ b/libs/misc/include/misc/matrix/Shape.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file Shape.h + * @brief This file contains Shape class for matrix + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __NNFW_MISC_MATRIX_SHAPE_H__ +#define __NNFW_MISC_MATRIX_SHAPE_H__ + +#include <cstdint> + +namespace nnfw +{ +namespace misc +{ +namespace matrix +{ + +/** + * @brief Structure to have values of dimensions for matrix + */ +struct Shape +{ + int32_t H; /**< The height value */ + int32_t W; /**< The width value */ + + /** + * @brief Construct Shape object using default constrcutor + */ + Shape() = default; + + /** + * @brief Construct Shape object with two values of dimensions + * @param[in] height The height value + * @param[in] width The width value + */ + Shape(int32_t height, int32_t width) : H{height}, W{width} + { + // DO NOTHING + } +}; + +} // namespace matrix +} // namespace misc +} // namespace nnfw + +#endif // __NNFW_MISC_MATRIX_SHAPE_H__ diff --git a/libs/misc/include/misc/tensor/Comparator.h b/libs/misc/include/misc/tensor/Comparator.h new file mode 100644 index 000000000..80f53043c --- /dev/null +++ b/libs/misc/include/misc/tensor/Comparator.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file Comparator.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains nnfw::misc::tensor::Comparator class + */ + +#ifndef __NNFW_MISC_TENSOR_COMPARATOR_H__ +#define __NNFW_MISC_TENSOR_COMPARATOR_H__ + +#include "misc/tensor/Index.h" +#include "misc/tensor/Shape.h" +#include "misc/tensor/Reader.h" +#include "misc/tensor/Diff.h" + +#include <functional> + +#include <vector> + +namespace nnfw +{ +namespace misc +{ +namespace tensor +{ + +/** + * @brief Class to compare two tensors (expected and obtained to compare) + */ +class Comparator +{ +public: + /** + * @brief Construct a new @c Comparator object + * @param[in] fn Function that compares two float values + */ + Comparator(const std::function<bool(float lhs, float rhs)> &fn) : _compare_fn{fn} + { + // DO NOTHING + } + +public: + /** + * @brief Struct to observe comparison results + */ + struct Observer + { + /** + * @brief Get notification of comparison result at every index of two tensors + * @param[in] index Index of tensors compared + * @param[in] expected Expected value of element at @c index + * @param[in] obtained Obtained value of element at @c index + * @return N/A + */ + virtual void notify(const Index &index, float expected, float obtained) = 0; + }; + +public: + /** + * @brief Compare two tensors + * @param[in] shape Shape of two tensors + * @param[in] expected @c Reader<float> object that accesses expected tensor + * @param[in] obtained @c Reader<float> object that accesses obtained tensor + * @param[in] observer @c Observer notified of expected value and obtained value at every index + * @return @c std::vector<Diff<float>> containing information of failed comparison + */ + // NOTE Observer should live longer than comparator + std::vector<Diff<float>> compare(const Shape &shape, const Reader<float> &expected, + const Reader<float> &obtained, + Observer *observer = nullptr) const; + +private: + std::function<bool(float lhs, float rhs)> _compare_fn; +}; + +} // namespace tensor +} // namespace misc +} // namespace nnfw + +#endif // __NNFW_MISC_TENSOR_COMPARATOR_H__ diff --git a/libs/misc/include/misc/tensor/Diff.h b/libs/misc/include/misc/tensor/Diff.h new file mode 100644 index 000000000..c41a97987 --- /dev/null +++ b/libs/misc/include/misc/tensor/Diff.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file Diff.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains nnfw::misc::tensor::Diff struct + */ + +#ifndef __NNFW_MISC_TENSOR_DIFF_H__ +#define __NNFW_MISC_TENSOR_DIFF_H__ + +#include "misc/tensor/Index.h" + +namespace nnfw +{ +namespace misc +{ +namespace tensor +{ + +/** + * @brief Struct to have information after comparing two elements of two tensors + */ +template <typename T> struct Diff +{ + Index index; /**< Index of elements in two tensors, which turn out to be different */ + + T expected; /**< Expected value of element of first tensor */ + T obtained; /**< Obtained value of element of second tensor */ + + /** + * @brief Construct a new @c Diff object + * @param[in] i Initial value of index + */ + Diff(const Index &i) : index(i) + { + // DO NOTHING + } + + /** + * @brief Construct a new @c Diff object + * @param[in] i Index value + * @param[in] e Expected value of element of first tensor + * @param[in] o Obtained value of element of second tensor + */ + Diff(const Index &i, const T &e, const T &o) : index(i), expected{e}, obtained{o} + { + // DO NOTHING + } +}; + +} // namespace tensor +} // namespace misc +} // namespace nnfw + +#endif // __NNFW_MISC_TENSOR_DIFF_H__ diff --git a/libs/misc/include/misc/tensor/Index.h b/libs/misc/include/misc/tensor/Index.h new file mode 100644 index 000000000..a08d7099e --- /dev/null +++ b/libs/misc/include/misc/tensor/Index.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file Index.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains nnfw::misc::tensor::Index struct + */ +#ifndef __NNFW_MISC_TENSOR_INDEX_H__ +#define __NNFW_MISC_TENSOR_INDEX_H__ + +#include <cstdint> +#include <cstddef> + +#include <vector> +#include <initializer_list> + +namespace nnfw +{ +namespace misc +{ +namespace tensor +{ + +/** + * @brief Struct to represent index of each dimension of a tensor + */ +struct Index +{ +public: + /** + * @brief Construct a new @c Index object + * @param[in] rank Rank of a tensor + */ + Index(size_t rank) { _offsets.resize(rank); } + +public: + /** + * @brief Construct a new @c Index object + * @param[in] offsets Rank of a tensor of @c std::initializer_list<int32_t> type + */ + Index(std::initializer_list<int32_t> offsets) : _offsets{offsets} + { + // DO NOTHING + } + +public: + /** + * @brief Get the rank + * @return Rank that this @c Index object can handle + */ + size_t rank(void) const { return _offsets.size(); } + +public: + /** + * @brief Get the index n'th dimension + * @param[in] n Dimension + * @return index of n'th dimension + */ + int32_t at(size_t n) const { return _offsets.at(n); } + + /** + * @brief Get the reference of the index n'th dimension + * @param[in] n Dimension + * @return reference of index of n'th dimension + */ + int32_t &at(size_t n) { return _offsets.at(n); } + +private: + std::vector<int32_t> _offsets; +}; + +/** + * @brief Copy an @c Index with reversed order + * @param[in] origin @c Index object to copy + * @return an @c Index object with reversed order + * @note This is used to convert NNAPI tensor index to ARM tensor index or vice versa + */ +inline static Index copy_reverse(const Index &origin) +{ + size_t rank = origin.rank(); + Index target(rank); + for (int i = 0; i < rank; i++) + target.at(i) = origin.at(rank - 1 - i); + return target; +} + +} // namespace tensor +} // namespace misc +} // namespace nnfw + +#endif // __NNFW_MISC_TENSOR_INDEX_H__ diff --git a/libs/misc/include/misc/tensor/IndexEnumerator.h b/libs/misc/include/misc/tensor/IndexEnumerator.h new file mode 100644 index 000000000..4912ea289 --- /dev/null +++ b/libs/misc/include/misc/tensor/IndexEnumerator.h @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file IndexEnumerator.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains nnfw::misc::tensor::IndexEnumerator class + */ + +#ifndef __NNFW_MISC_TENSOR_INDEX_ENUMERATOR_H__ +#define __NNFW_MISC_TENSOR_INDEX_ENUMERATOR_H__ + +#include "misc/tensor/Shape.h" +#include "misc/tensor/Index.h" + +namespace nnfw +{ +namespace misc +{ +namespace tensor +{ +/** + * @brief Class to enumerate index of a tensor + * + */ +class IndexEnumerator +{ +public: + /** + * @brief Construct a new @c IndexEnumerator object + * @param[in] shape Shape of tensor of which index will be enumerate + */ + explicit IndexEnumerator(const Shape &shape) : _shape(shape), _index(shape.rank()), _cursor(0) + { + const size_t rank = _shape.rank(); + + for (size_t axis = 0; axis < rank; ++axis) + { + _index.at(axis) = 0; + } + + for (_cursor = 0; _cursor < rank; ++_cursor) + { + if (_index.at(_cursor) < _shape.dim(_cursor)) + { + break; + } + } + } + +public: + /** + * @brief Prevent constructing @c IndexEnumerator object by using R-value reference + */ + IndexEnumerator(IndexEnumerator &&) = delete; + /** + * @brief Prevent copy constructor + */ + IndexEnumerator(const IndexEnumerator &) = delete; + +public: + /** + * @brief Check if more enumeration is available + * @return @c true if more @c advance() is available, otherwise @c false + */ + bool valid(void) const { return _cursor < _shape.rank(); } + +public: + /** + * @brief Get the current index to enumerate + * @return Current index + */ + const Index &curr(void) const { return _index; } + +public: + /** + * @brief Advance index by +1 + */ + void advance(void) + { + const size_t rank = _shape.rank(); + + // Find axis to be updated + while ((_cursor < rank) && !(_index.at(_cursor) + 1 < _shape.dim(_cursor))) + { + ++_cursor; + } + + if (_cursor == rank) + { + return; + } + + // Update index + _index.at(_cursor) += 1; + + for (size_t axis = 0; axis < _cursor; ++axis) + { + _index.at(axis) = 0; + } + + // Update cursor + _cursor = 0; + } + +public: + const Shape _shape; //!< Shape to enumerate + +private: + size_t _cursor; + Index _index; +}; + +} // namespace tensor +} // namespace misc +} // namespace nnfw + +#endif // __NNFW_MISC_TENSOR_INDEX_ENUMERATOR_H__ diff --git a/libs/misc/include/misc/tensor/IndexFormatter.h b/libs/misc/include/misc/tensor/IndexFormatter.h new file mode 100644 index 000000000..7ae34eec1 --- /dev/null +++ b/libs/misc/include/misc/tensor/IndexFormatter.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file IndexFormatter.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains nnfw::misc::tensor::IndexFormatter class + */ + +#ifndef __NNFW_MISC_TENSOR_INDEX_FORMATTER_H__ +#define __NNFW_MISC_TENSOR_INDEX_FORMATTER_H__ + +#include "misc/tensor/Index.h" + +#include <ostream> + +namespace nnfw +{ +namespace misc +{ +namespace tensor +{ + +/** + * @brief Class to send @c Index object to output stream + */ +class IndexFormatter +{ +public: + /** + * @brief Construct a new @c IndexFormatter object + * @param[in] index index to be sent to output stream + */ + IndexFormatter(const nnfw::misc::tensor::Index &index) : _index(index) + { + // DO NOTHING + } + +public: + /** + * @brief Get an @c Index object + * @return @c Index object previously passed to the constructor + */ + const nnfw::misc::tensor::Index &index(void) const { return _index; } + +private: + const nnfw::misc::tensor::Index &_index; +}; + +/** + * @brief Send @c IndexFormatter object to output stream + * @param[in] os Output stream + * @param[in] fmt @c IndexFormatter object that is sent to output stream + * @return Output stream + */ +std::ostream &operator<<(std::ostream &os, const IndexFormatter &fmt); + +} // namespace tensor +} // namespace misc +} // namespace nnfw + +#endif // __NNFW_MISC_TENSOR_INDEX_FORMATTER_H__ diff --git a/libs/misc/include/misc/tensor/IndexIterator.h b/libs/misc/include/misc/tensor/IndexIterator.h new file mode 100644 index 000000000..f6428e19e --- /dev/null +++ b/libs/misc/include/misc/tensor/IndexIterator.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file IndexIterator.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains nnfw::misc::tensor::IndexIterator class and + * helper function and operator + */ +#ifndef __NNFW_MISC_TENSOR_INDEX_ITERATOR_H__ +#define __NNFW_MISC_TENSOR_INDEX_ITERATOR_H__ + +#include "misc/tensor/Shape.h" +#include "misc/tensor/Index.h" +#include "misc/tensor/IndexEnumerator.h" + +namespace nnfw +{ +namespace misc +{ +namespace tensor +{ + +/** + * @brief Class to iterate indexes available for given shape + */ +class IndexIterator +{ +public: + /** + * @brief Construct a new @c IndexIterator object + * @param[in] shape Shape of tensor of which index will be iterated + */ + IndexIterator(const Shape &shape) : _shape(shape) + { + // DO NOTHING + } + +public: + /** + * @brief Construct a new IndexIterator object using reference + * @param[in] IndexIterator @c IndexIterator object to move + */ + IndexIterator(IndexIterator &&) = default; + + /** + * @brief Prevent copy constructor + */ + IndexIterator(const IndexIterator &) = delete; + +public: + /** + * @brief Iterate all available indexes and run a function for each index + * @param[in] fn Function that requires an index as a parameter. 
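+   *
+   * A rough call-site sketch (illustrative only; usually driven through the
+   * iterate() helper and operator<< declared later in this header):
+   * @code
+   * const Shape shape{2, 3, 4};
+   * iterate(shape) << [](const Index &index) {
+   *   // 'index' visits every valid position of a 2x3x4 tensor exactly once
+   * };
+   * @endcode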
+ * @return @c IndexIterator object + */ + template <typename Callable> IndexIterator &iter(Callable fn) + { + for (IndexEnumerator e{_shape}; e.valid(); e.advance()) + { + fn(e.curr()); + } + + return (*this); + } + +private: + const Shape &_shape; +}; + +/** + * @brief Get an @c IndexItator object + * @param[in] shape Shape of tensor of which index will be iterated + * @return @c IndexIterator object + */ +inline IndexIterator iterate(const Shape &shape) { return IndexIterator{shape}; } + +/** + * @brief Iterate all indexes and apply a function + * @param[in] it @c IndexIterator object that is constructed with a tensor shape + * @param[in] cb A function that will receive a specific index. + * Inside the function, the index is used to manipulate tensor element. + * @return @c IndexIterator object + */ +template <typename Callable> IndexIterator &operator<<(IndexIterator &&it, Callable cb) +{ + return it.iter(cb); +} + +} // namespace tensor +} // namespace misc +} // namespace nnfw + +#endif // __NNFW_MISC_TENSOR_INDEX_ITERATOR_H__ diff --git a/libs/misc/include/misc/tensor/NonIncreasingStride.h b/libs/misc/include/misc/tensor/NonIncreasingStride.h new file mode 100644 index 000000000..e7ad0857b --- /dev/null +++ b/libs/misc/include/misc/tensor/NonIncreasingStride.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file NonIncreasingStride.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains nnfw::misc::tensor::NonIncreasingStride class + */ +#ifndef __NNFW_MISC_TENSOR_NON_INCREASING_STRIDE_H__ +#define __NNFW_MISC_TENSOR_NON_INCREASING_STRIDE_H__ + +#include "misc/tensor/Shape.h" +#include "misc/tensor/Index.h" + +#include <vector> + +namespace nnfw +{ +namespace misc +{ +namespace tensor +{ + +/** + * @brief Class to represent strides where stride[N-1] >= stride[N] holds for all N < rank + */ +class NonIncreasingStride +{ +public: + /** + * @brief Initialize the stride data using @c Shape + * @param[in] shape to build stride info + * @return N/A + */ + void init(const Shape &shape) + { + _stride.resize(shape.rank()); + _stride.at(shape.rank() - 1) = 1; + + for (uint32_t axis = shape.rank() - 1; axis > 0; --axis) + { + _stride.at(axis - 1) = _stride.at(axis) * shape.dim(axis); + } + } + +public: + /** + * @brief Get an stride value for specific axis + * @param[in] axis Axis of stride + * @return The value of stride + */ + uint32_t at(uint32_t axis) const { return _stride.at(axis); } + +public: + /** + * @brief Get the 1-D offset of specified index for n-D tensor + * @param index @c Index object + * @return 1-D offset of index + */ + uint32_t offset(const Index &index) const; + +private: + std::vector<uint32_t> _stride; +}; + +} // namespace tensor +} // namespace misc +} // namespace nnfw + +#endif // __NNFW_MISC_TENSOR_NON_INCREASING_STRIDE_H__ diff --git a/libs/misc/include/misc/tensor/Object.h b/libs/misc/include/misc/tensor/Object.h new file mode 100644 index 000000000..83fbc0bd1 --- /dev/null +++ b/libs/misc/include/misc/tensor/Object.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file Object.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains nnfw::misc::tensor::Object class + */ + +#ifndef __NNFW_MISC_TENSOR_OBJECT_H__ +#define __NNFW_MISC_TENSOR_OBJECT_H__ + +#include "misc/tensor/Shape.h" +#include "misc/tensor/Index.h" +#include "misc/tensor/IndexIterator.h" +#include "misc/tensor/NonIncreasingStride.h" +#include "misc/tensor/Reader.h" + +#include <vector> + +namespace nnfw +{ +namespace misc +{ +namespace tensor +{ + +/** + * @brief Class to build a tensor using specific generator + * @tparam T Type of tensor element + */ + +template <typename T> class Object final : public Reader<T> +{ +public: + /** + * @brief Function to generate tensor element + */ + using Generator = std::function<T(const Shape &shape, const Index &index)>; + +public: + /** + * @brief Construct a new @c Object object + * @param[in] shape Tensor shape + * @param[in] fn Function to generate tensor elements + */ + Object(const Shape &shape, const Generator &fn) : _shape{shape} + { + // Set 'stride' + _stride.init(shape); + + // Pre-allocate buffer + _values.resize(_shape.dim(0) * _stride.at(0)); + + // Set 'value' + iterate(_shape) << + [this, &fn](const Index &index) { _values.at(_stride.offset(index)) = fn(_shape, index); }; + } + +public: + /** + * @brief Get reference of shape + * @return Reference of shape + */ + const Shape &shape(void) const { return _shape; } + +public: + /** + * @brief Get and element of tensor + * @param[in] index Index of a tensor element + * @return Value of tensor element + */ + T at(const Index &index) const override { return _values.at(_stride.offset(index)); } + +private: + Shape _shape; + NonIncreasingStride _stride; + +private: + std::vector<T> _values; +}; + +} // namespace tensor +} // namespace misc +} // namespace nnfw + +#endif // __NNFW_MISC_FEATURE_OBJECT_H__ diff --git a/libs/misc/include/misc/tensor/Reader.h b/libs/misc/include/misc/tensor/Reader.h new file mode 100644 index 000000000..9175a913e --- /dev/null +++ b/libs/misc/include/misc/tensor/Reader.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file Reader.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains nnfw::misc::tensor::Reader struct + */ + +#ifndef __NNFW_MISC_TENSOR_READER_H__ +#define __NNFW_MISC_TENSOR_READER_H__ + +#include "misc/tensor/Index.h" + +namespace nnfw +{ +namespace misc +{ +namespace tensor +{ + +/** + * @brief Struct to read element of tensor + * @tparam T Type of elements in tensor + */ +template <typename T> struct Reader +{ + /** + * @brief Destroy the Reader object + */ + virtual ~Reader() = default; + + /** + * @brief Get an element of tensor + * @param[in] index Index specifying indexes of tensor element + * @return The value of specificed element + */ + virtual T at(const Index &index) const = 0; +}; + +} // namespace tensor +} // namespace misc +} // namespace nnfw + +#endif // __NNFW_MISC_TENSOR_READER_H__ diff --git a/libs/misc/include/misc/tensor/Shape.h b/libs/misc/include/misc/tensor/Shape.h new file mode 100644 index 000000000..6e6c23502 --- /dev/null +++ b/libs/misc/include/misc/tensor/Shape.h @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file Shape.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains nnfw::misc::tensor::Shape class + */ + +#ifndef __NNFW_MISC_TENSOR_SHAPE_H__ +#define __NNFW_MISC_TENSOR_SHAPE_H__ + +#include <cstdint> +#include <cstddef> +#include <deque> +#include <initializer_list> +#include <ostream> +#include <string> + +namespace nnfw +{ +namespace misc +{ +namespace tensor +{ + +/** + * @brief Class to represent shape of a tensor + */ +class Shape +{ +public: + /** + * @brief Construct a new Shape object + * @param[in] rank Rank of a tensor + */ + Shape(size_t rank) { _dimensions.resize(rank); } + +public: + /** + * @brief Construct a new Shape object + * @param[in] dimensions @c initializer_list<int32_t> of dimensions of tensor + */ + Shape(const std::initializer_list<int32_t> &dimensions) : _dimensions{dimensions} + { + // DO NOTHING + } + + /** + * @brief Construct a new Shape object + * @param[in] origin @c Shape object to copy + */ + Shape(const Shape &origin) = default; + +public: + /** + * @brief Add dimension to the beginning + * @param[in] d dimension to add to the beginning + * @return N/A + */ + void prepend(int32_t d) { _dimensions.emplace_front(d); } + + /** + * @brief Add dimension to the back + * @param[in] d dimension to add to the back + * @return N/A + */ + void append(int32_t d) { _dimensions.emplace_back(d); } + +public: + /** + * @brief Get the rank of this shape + * @return rank + */ + size_t rank(void) const { return _dimensions.size(); } + +public: + /** + * @brief Get specific dimension + * @param[in] n Index of dimension + * @return n'th dimension + */ + int32_t dim(size_t n) const { return _dimensions.at(n); } + + /** + * @brief Get the reference of specific dimension + * @param[in] n Index of dimension + * @return Reference of n'th dimension + */ + int32_t &dim(size_t n) { return 
_dimensions.at(n); } + +public: + /** + * @brief Get the number of elements specified by this shape + * @return The number of elements + */ + size_t element_nums() const + { + size_t nums = 1; + for (auto d : _dimensions) + { + nums *= d; + } + return nums; + } + +private: + std::deque<int32_t> _dimensions; + +public: + /** + * @brief Get a @c Shape object after parsing string + * @param[in] s String of dimension list. Accepted format is numbers separated by comma. + * @return @c Shape object + */ + static Shape from(const std::string &s); +}; + +/** + * @brief Check equality of two @c Shape + * @param[in] Shape First shape to compare + * @param[in] Shape Second shape to compare + * @return @c true if both shapes are equal, otherwise @c false + */ +bool operator==(const Shape &, const Shape &); + +/** + * @brief Send @c Shape to @c std::ostream + * @param[in] os @c std::ostream to process this @c Shape + * @param[in] shape @c Shape to send to @c ostream + * @return Reference of @c std::ostream + */ +std::ostream &operator<<(std::ostream &os, const Shape &shape); + +} // namespace tensor +} // namespace misc +} // namespace nnfw + +#endif // __NNFW_MISC_TENSOR_SHAPE_H__ diff --git a/libs/misc/include/misc/tensor/Zipper.h b/libs/misc/include/misc/tensor/Zipper.h new file mode 100644 index 000000000..8f0ec4ab6 --- /dev/null +++ b/libs/misc/include/misc/tensor/Zipper.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file Zipper.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains nnfw::misc::tensor::Zipper class + */ + +#ifndef __NNFW_MISC_TENSOR_ZIPPER_H__ +#define __NNFW_MISC_TENSOR_ZIPPER_H__ + +#include "misc/tensor/Index.h" +#include "misc/tensor/IndexIterator.h" +#include "misc/tensor/Reader.h" + +namespace nnfw +{ +namespace misc +{ +namespace tensor +{ + +/** + * @brief Class to apply a function with three params: @c Index, elements of a tensor + * at passed index read by @c Reader objects + */ +template <typename T> class Zipper +{ +public: + /** + * @brief Construct a new @c Zipper object + * @param[in] shape Shape of @c lhs and @c rhs + * @param[in] lhs @c Reader object of a tensor + * @param[in] rhs @c Reader object of a tensor + */ + Zipper(const Shape &shape, const Reader<T> &lhs, const Reader<T> &rhs) + : _shape{shape}, _lhs{lhs}, _rhs{rhs} + { + // DO NOTHING + } + +public: + /** + * @brief Apply @c cb to all elements of tensors. 
Elements of two tensors + * at passed @c index are read by @c lhs and @c rhs + * @param[in] cb Function to apply + * @return N/A + */ + template <typename Callable> void zip(Callable cb) const + { + iterate(_shape) << + [this, &cb](const Index &index) { cb(index, _lhs.at(index), _rhs.at(index)); }; + } + +private: + const Shape &_shape; + const Reader<T> &_lhs; + const Reader<T> &_rhs; +}; + +/** + * @brief Apply @c cb by using @c lhs and @c rhs passed to the constructor of @c zipper + * @param[in] zipper @c Zipper object + * @param[in] cb Function to zpply using @c zip function + * @return @c zipper object after applying @c cb to @c zipper + */ +template <typename T, typename Callable> +const Zipper<T> &operator<<(const Zipper<T> &zipper, Callable cb) +{ + zipper.zip(cb); + return zipper; +} + +/** + * @brief Get @c Zipper object constructed using passed params + * @param shape Shape of @c lhs and @c rhs + * @param lhs @c Reader object of a tensor + * @param rhs @c Reader object of a tensor + * @return @c Zipper object + */ +template <typename T> Zipper<T> zip(const Shape &shape, const Reader<T> &lhs, const Reader<T> &rhs) +{ + return Zipper<T>{shape, lhs, rhs}; +} + +} // namespace tensor +} // namespace misc +} // namespace nnfw + +#endif // __NNFW_MISC_TENSOR_ZIPPER_H__ diff --git a/libs/misc/include/misc/vector.h b/libs/misc/include/misc/vector.h new file mode 100644 index 000000000..395b08912 --- /dev/null +++ b/libs/misc/include/misc/vector.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file vector.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains @c == operator to check equality of elements in two vectors + */ +#ifndef __NNFW_MISC_VECTOR_H__ +#define __NNFW_MISC_VECTOR_H__ + +#include <vector> + +/** + * @brief Compare elements of two vectors + * @tparam T Type of elements in vectors + * @param[in] lhs First vector to compare + * @param[in] rhs Second vector to compare + * @return @c true if all elements are equal, otherwise @c false. + */ +template <typename T> bool operator==(const std::vector<T> &lhs, const std::vector<T> &rhs) +{ + if (lhs.size() != rhs.size()) + { + return false; + } + + for (size_t ind = 0; ind < lhs.size(); ++ind) + { + if (lhs.at(ind) != rhs.at(ind)) + { + return false; + } + } + + return true; +} + +#endif // __NNFW_MISC_VECTOR_H__ diff --git a/libs/misc/include/misc/vector/Object.h b/libs/misc/include/misc/vector/Object.h new file mode 100644 index 000000000..65d4bc613 --- /dev/null +++ b/libs/misc/include/misc/vector/Object.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file Object.h + * @brief This file contains Object class + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __NNFW_MISC_VECTOR_OBJECT_H__ +#define __NNFW_MISC_VECTOR_OBJECT_H__ + +#include "misc/vector/Reader.h" + +#include <vector> +#include <functional> + +namespace nnfw +{ +namespace misc +{ +namespace vector +{ + +/** + * @brief Class to have information of the operand for vector + */ +template <typename T> class Object final : public Reader<T> +{ +public: + using Generator = std::function<T(int32_t size, int32_t offset)>; + +public: + /** + * @brief Construct Object object with size of vector and set value used by Generator + * @param[in] size The size of vector + * @param[in] gen A function to set values of operand tensor + */ + Object(int32_t size, const Generator &gen) : _size{size} + { + _value.resize(_size); + + for (int32_t offset = 0; offset < size; ++offset) + { + _value.at(offset) = gen(size, offset); + } + } + +public: + /** + * @brief Get size of vector + * @return Size of vector + */ + int32_t size(void) const { return _size; } + +public: + /** + * @brief Get the value used by index + * @param[in] nth The vector index + * @return The value at the offset + */ + T at(uint32_t nth) const override { return _value.at(nth); } + +private: + /** + * @brief Size of vector + */ + const int32_t _size; + /** + * @brief The tensor vector of operand + */ + std::vector<T> _value; +}; + +} // namespace vector +} // namespace misc +} // namespace nnfw + +#endif // __NNFW_MISC_VECTOR_OBJECT_H__ diff --git a/libs/misc/include/misc/vector/Reader.h b/libs/misc/include/misc/vector/Reader.h new file mode 100644 index 000000000..eab4c427b --- /dev/null +++ b/libs/misc/include/misc/vector/Reader.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file Reader.h + * @brief This file contains Reader class + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __NNFW_MISC_VECTOR_READER_H__ +#define __NNFW_MISC_VECTOR_READER_H__ + +#include <cstdint> + +namespace nnfw +{ +namespace misc +{ +namespace vector +{ + +/** + * @brief Class reads values of vector + * The interface class + */ +template <typename T> struct Reader +{ + /** + * @brief Destruct Reader object using default destructor + */ + virtual ~Reader() = default; + + /** + * @brief Get the value used by the index + * @param[in] nth The vector index + * @return The value at the offset + */ + virtual T at(uint32_t nth) const = 0; +}; + +} // namespace vector +} // namespace misc +} // namespace nnfw + +#endif // __NNFW_MISC_VECTOR_READER_H__ diff --git a/libs/util/src/environment.cpp b/libs/misc/src/environment.cpp index 4b18b409f..e39f18d62 100644 --- a/libs/util/src/environment.cpp +++ b/libs/misc/src/environment.cpp @@ -18,11 +18,11 @@ #include <cstdlib> #include <string> -#include "util/environment.h" +#include "misc/environment.h" namespace nnfw { -namespace util +namespace misc { int get_env_int(const char *name, int defaultValue) @@ -44,12 +44,12 @@ bool get_env_bool(const char *name, bool defaultValue) return defaultValue; } -} // namespace util +} // namespace misc } // namespace nnfw namespace nnfw { -namespace util +namespace misc { namespace env { @@ -91,5 +91,5 @@ bool FloatAccessor::access(float &out) const } } // namespace env -} // namespace util +} // namespace misc } // namespace nnfw diff --git a/libs/util/src/tensor/Comparator.cpp b/libs/misc/src/tensor/Comparator.cpp index 89cd687e9..013c9eed2 100644 --- a/libs/util/src/tensor/Comparator.cpp +++ b/libs/misc/src/tensor/Comparator.cpp @@ -1,11 +1,11 @@ -#include "util/tensor/Comparator.h" -#include "util/tensor/Zipper.h" +#include "misc/tensor/Comparator.h" +#include "misc/tensor/Zipper.h" -#include "util/fp32.h" +#include "misc/fp32.h" namespace nnfw { -namespace util +namespace misc { namespace tensor { @@ -18,7 +18,7 @@ std::vector<Diff<float>> Comparator::compare(const Shape &shape, const Reader<fl zip(shape, expected, obtained) << [&](const Index &index, float expected_value, float obtained_value) { - const auto relative_diff = nnfw::util::fp32::relative_diff(expected_value, obtained_value); + const auto relative_diff = nnfw::misc::fp32::relative_diff(expected_value, obtained_value); if (!_compare_fn(expected_value, obtained_value)) { @@ -36,5 +36,5 @@ std::vector<Diff<float>> Comparator::compare(const Shape &shape, const Reader<fl } } // namespace tensor -} // namespace util +} // namespace misc } // namespace nnfw diff --git a/libs/util/src/tensor/IndexFormatter.cpp b/libs/misc/src/tensor/IndexFormatter.cpp index 66ff80771..c949db7a8 100644 --- a/libs/util/src/tensor/IndexFormatter.cpp +++ b/libs/misc/src/tensor/IndexFormatter.cpp @@ -14,13 +14,13 @@ * limitations under the License. 
*/ -#include "util/tensor/IndexFormatter.h" +#include "misc/tensor/IndexFormatter.h" #include <cassert> namespace nnfw { -namespace util +namespace misc { namespace tensor { @@ -45,5 +45,5 @@ std::ostream &operator<<(std::ostream &os, const IndexFormatter &fmt) } } // namespace tensor -} // namespace util +} // namespace misc } // namespace nnfw diff --git a/libs/util/src/tensor/NonIncreasingStride.cpp b/libs/misc/src/tensor/NonIncreasingStride.cpp index 3774ded83..c51ad0324 100644 --- a/libs/util/src/tensor/NonIncreasingStride.cpp +++ b/libs/misc/src/tensor/NonIncreasingStride.cpp @@ -14,13 +14,13 @@ * limitations under the License. */ -#include "util/tensor/NonIncreasingStride.h" +#include "misc/tensor/NonIncreasingStride.h" #include <cassert> namespace nnfw { -namespace util +namespace misc { namespace tensor { @@ -42,5 +42,5 @@ uint32_t NonIncreasingStride::offset(const Index &index) const } } // namespace tensor -} // namespace util +} // namespace misc } // namespace nnfw diff --git a/libs/util/src/tensor/Shape.cpp b/libs/misc/src/tensor/Shape.cpp index f1de26fdc..675695e8e 100644 --- a/libs/util/src/tensor/Shape.cpp +++ b/libs/misc/src/tensor/Shape.cpp @@ -14,13 +14,13 @@ * limitations under the License. */ -#include "util/tensor/Shape.h" +#include "misc/tensor/Shape.h" #include <cassert> namespace nnfw { -namespace util +namespace misc { namespace tensor { @@ -95,5 +95,5 @@ std::ostream &operator<<(std::ostream &os, const Shape &shape) } } // namespace tensor -} // namespace util +} // namespace misc } // namespace nnfw diff --git a/libs/profiling/CMakeLists.txt b/libs/profiling/CMakeLists.txt new file mode 100644 index 000000000..7169508a1 --- /dev/null +++ b/libs/profiling/CMakeLists.txt @@ -0,0 +1,5 @@ +file(GLOB_RECURSE SOURCES "src/*.cpp") + +add_library(nnfw_lib_profiling STATIC ${SOURCES}) +set_property(TARGET nnfw_lib_profiling PROPERTY POSITION_INDEPENDENT_CODE ON) +target_include_directories(nnfw_lib_profiling PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include) diff --git a/libs/profiling/include/profiling/profile_buffer.h b/libs/profiling/include/profiling/profile_buffer.h new file mode 100644 index 000000000..83cd3eb2b --- /dev/null +++ b/libs/profiling/include/profiling/profile_buffer.h @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +// NOTE To minimize diff with upstream tensorflow, disable clang-format +// clang-format off + +// NOTE This header is derived from the following file (in TensorFlow v1.12) +// 'externals/tensorflow/tensorflow/contrib/lite/profiling/profile_buffer.h +#ifndef TENSORFLOW_CONTRIB_LITE_PROFILING_PROFILE_BUFFER_H_ +#define TENSORFLOW_CONTRIB_LITE_PROFILING_PROFILE_BUFFER_H_ + +#include <cstddef> +#include <cstdint> + +#include "profiling/time.h" + +namespace tflite { +namespace profiling { + +// A profiling event. +struct ProfileEvent { + // Describes the type of event. + // The event_metadata field may contain additional data for interpreting + // the event. + enum class EventType { + // Default event type, the metadata field has no special significance. + DEFAULT = 0, + // The event is an operator invocation and the event_metadata field is the + // index of operator node. + OPERATOR_INVOKE_EVENT = 1 + }; + + // Label of the event. This usually describes the event. + const char* tag; + // Timestamp in microseconds when the event began. + uint64_t begin_timestamp_us; + // Timestamp in microseconds when the event ended. + uint64_t end_timestamp_us; + // The field containing the type of event. This must be one of the event types + // in EventType. + EventType event_type; + // Extra data describing the details of the event. + uint32_t event_metadata; +}; +} // namespace profiling +} // namespace tflite + +#ifdef TFLITE_PROFILING_ENABLED + +#include <sys/time.h> +#include <vector> + +namespace tflite { +namespace profiling { +constexpr uint32_t kInvalidEventHandle = static_cast<uint32_t>(~0) - 1; + +// A ring buffer of profile events. +// This class is not thread safe. +class ProfileBuffer { + public: + ProfileBuffer(uint32_t max_num_entries, bool enabled) + : enabled_(enabled), current_index_(0), event_buffer_(max_num_entries) {} + + // Adds an event to the buffer with begin timestamp set to the current + // timestamp. Returns a handle to event that can be used to call EndEvent. If + // buffer is disabled this has no affect. + // The tag of the event should remain valid till the buffer is valid. + uint32_t BeginEvent(const char* tag, ProfileEvent::EventType event_type, + uint32_t event_metadata) { + if (!enabled_) { + return kInvalidEventHandle; + } + uint64_t timestamp = time::NowMicros(); + int index = current_index_ % event_buffer_.size(); + event_buffer_[index].tag = tag; + event_buffer_[index].event_type = event_type; + event_buffer_[index].event_metadata = event_metadata; + event_buffer_[index].begin_timestamp_us = timestamp; + event_buffer_[index].end_timestamp_us = 0; + current_index_++; + return index; + } + + // Sets the enabled state of buffer to |enabled| + void SetEnabled(bool enabled) { enabled_ = enabled; } + + // Sets the end timestamp for event for the handle to current time. + // If the buffer is disabled or previous event has been overwritten this + // operation has not effect. + void EndEvent(uint32_t event_handle) { + if (!enabled_ || event_handle == kInvalidEventHandle || + event_handle > current_index_) { + return; + } + const uint32_t max_size = event_buffer_.size(); + if (current_index_ > (max_size + event_handle)) { + // Ignore, buffer has already overflowed. + return; + } + + int event_index = event_handle % max_size; + event_buffer_[event_index].end_timestamp_us = time::NowMicros(); + } + + // Returns the size of the buffer. 
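+  // Note that this is the number of events recorded so far, capped at the
+  // buffer capacity once the ring buffer has wrapped around.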
+ size_t Size() const { + return (current_index_ >= event_buffer_.size()) ? event_buffer_.size() + : current_index_; + } + + // Resets the buffer. + void Reset() { + enabled_ = false; + current_index_ = 0; + } + + // Returns the profile event at the given index. If the index is invalid a + // nullptr is returned. The return event may get overwritten if more events + // are added to buffer. + const struct ProfileEvent* const At(int index) const { + size_t size = Size(); + if (index >= size) { + return nullptr; + } + const uint32_t max_size = event_buffer_.size(); + uint32_t start = + (current_index_ > max_size) ? current_index_ % max_size : max_size; + index = (index + start) % max_size; + return &event_buffer_[index]; + } + + private: + bool enabled_; + uint32_t current_index_; + std::vector<ProfileEvent> event_buffer_; +}; +} // namespace profiling +} // namespace tflite +#endif // TFLITE_PROFILING_ENABLED +#endif // TENSORFLOW_CONTRIB_LITE_PROFILING_PROFILE_BUFFER_H_ + +// clang-format on diff --git a/libs/profiling/include/profiling/profiler.h b/libs/profiling/include/profiling/profiler.h new file mode 100644 index 000000000..953042da3 --- /dev/null +++ b/libs/profiling/include/profiling/profiler.h @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// NOTE To minimize diff with upstream tensorflow, disable clang-format +// clang-format off + +// NOTE This header is derived from the following file (in TensorFlow v1.12) +// 'externals/tensorflow/tensorflow/contrib/lite/profiling/profiler.h +#ifndef TENSORFLOW_CONTRIB_LITE_PROFILING_PROFILER_H_ +#define TENSORFLOW_CONTRIB_LITE_PROFILING_PROFILER_H_ + +#include <vector> + +#include "profiling/profile_buffer.h" + +#ifdef TFLITE_PROFILING_ENABLED + +namespace tflite { +namespace profiling { +class ScopedProfile; +class ScopedOperatorProfile; + +// Controls whether profiling is enabled or disabled and collects profiles. +// TFLite is used on platforms that don't have posix threads, so the profiler is +// kept as simple as possible. It is designed to be used only on a single +// thread. +// +// Profiles are collected using Scoped*Profile objects that begin and end a +// profile event. 
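+// Each Scoped*Profile object records a begin timestamp when it is constructed
+// and an end timestamp when it goes out of scope.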
+// An example usage is shown in the example below: +// +// Say Worker class has a DoWork method and we are interested in profiling +// the overall execution time for DoWork and time spent in Task1 and Task2 +// functions. +// +// class Worker { +// public: +// void DoWork() { +// ScopedProfile(&controller, "DoWork"); +// Task1(); +// Task2(); +// ..... +// } +// +// void Task1() { +// ScopedProfile(&controller, "Task1"); +// .... +// } +// +// void Task2() { +// ScopedProfile(&controller, "Task2"); +// } +// +// Profiler profiler; +// } +// +// We instrument the functions that need to be profiled. +// +// Profile can be collected by enable profiling and then getting profile +// events. +// +// void ProfileWorker() { +// Worker worker; +// worker.profiler.EnableProfiling(); +// worker.DoWork(); +// worker.profiler.DisableProfiling(); +// // Profiling is complete, extract profiles. +// auto profile_events = worker.profiler.GetProfiles(); +// } +// +// +class Profiler { + public: + Profiler() : buffer_(1024, false) {} + + void StartProfiling() { buffer_.SetEnabled(true); } + void StopProfiling() { buffer_.SetEnabled(false); } + void Reset() { buffer_.Reset(); } + std::vector<const ProfileEvent*> GetProfileEvents() { + std::vector<const ProfileEvent*> profile_events; + profile_events.reserve(buffer_.Size()); + for (size_t i = 0; i < buffer_.Size(); i++) { + profile_events.push_back(buffer_.At(i)); + } + return profile_events; + } + + private: + friend class ScopedProfile; + friend class ScopedOperatorProfile; + ProfileBuffer* GetProfileBuffer() { return &buffer_; } + ProfileBuffer buffer_; +}; + +class ScopedProfile { + public: + // Adds a profile event to profile that begins with the construction + // of object and ends when the object goes out of scope. + // The lifetime of tag should be at least the lifetime of profiler. + + ScopedProfile(Profiler* profiler, const char* tag) + : buffer_(nullptr), event_handle_(0) { + if (profiler) { + buffer_ = profiler->GetProfileBuffer(); + event_handle_ = + buffer_->BeginEvent(tag, ProfileEvent::EventType::DEFAULT, 0); + } + } + ~ScopedProfile() { + if (buffer_) { + buffer_->EndEvent(event_handle_); + } + } + + private: + ProfileBuffer* buffer_; + int32_t event_handle_; +}; + +class ScopedOperatorProfile { + public: + // Adds a profile event to profile that begins with the construction + // of object and ends when the object goes out of scope. + // The lifetime of tag should be at least the lifetime of profiler. + ScopedOperatorProfile(Profiler* profiler, const char* tag, int node_index) + : buffer_(nullptr), event_handle_(0) { + if (profiler) { + buffer_ = profiler->GetProfileBuffer(); + event_handle_ = buffer_->BeginEvent( + tag, ProfileEvent::EventType::OPERATOR_INVOKE_EVENT, node_index); + } + } + + ~ScopedOperatorProfile() { + if (buffer_) { + buffer_->EndEvent(event_handle_); + } + } + + private: + ProfileBuffer* buffer_; + int32_t event_handle_; +}; + +} // namespace profiling +} // namespace tflite + +#define VARNAME_UNIQ(name, ctr) name##ctr + +#define SCOPED_OPERATOR_PROFILE(profiler, node_index) \ + tflite::profiling::ScopedOperatorProfile VARNAME_UNIQ( \ + _profile_, __COUNTER__)((profiler), "OpInvoke", (node_index)) +#else + +namespace tflite { +namespace profiling { +// A noop version of profiler when profiling is disabled. 
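+// It exposes the same interface as the real profiler so call sites compile
+// unchanged, and SCOPED_OPERATOR_PROFILE expands to nothing in this build.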
+class Profiler { + public: + Profiler() {} + void StartProfiling() {} + void StopProfiling() {} + void Reset() {} + std::vector<const ProfileEvent*> GetProfileEvents() { return {}; } +}; +} // namespace profiling +} // namespace tflite + +#define SCOPED_OPERATOR_PROFILE(profiler, node_index) + +#endif // TFLITE_PROFILING_ENABLED + +#endif // TENSORFLOW_CONTRIB_LITE_PROFILING_PROFILER_H_ + +// clang-format on diff --git a/libs/profiling/include/profiling/profiling.h b/libs/profiling/include/profiling/profiling.h new file mode 100644 index 000000000..ee0df1338 --- /dev/null +++ b/libs/profiling/include/profiling/profiling.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_MISC_PROFILING_H__ +#define __NNFW_MISC_PROFILING_H__ + +#include <iostream> + +namespace tflite +{ +namespace profiling +{ +class Profiler; // forward declaration +} +} + +namespace profiling +{ + +class Context +{ +public: + Context() : _sync(false), _profiler(nullptr) {} + +public: + const bool &sync(void) const { return _sync; } + tflite::profiling::Profiler *getProfiler() { return _profiler; } + void setProfiler(tflite::profiling::Profiler *p) { _profiler = p; } + void setSync(void) { _sync = true; } + +private: + bool _sync; + tflite::profiling::Profiler *_profiler; + +public: + static Context &get(void) + { + static Context ctx{}; + return ctx; + } +}; + +} // namespace profiling +#endif // __NNFW_MISC_PROFILING_H__ diff --git a/libs/profiling/include/profiling/time.h b/libs/profiling/include/profiling/time.h new file mode 100644 index 000000000..4b194944d --- /dev/null +++ b/libs/profiling/include/profiling/time.h @@ -0,0 +1,35 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +// NOTE To minimize diff with upstream tensorflow, disable clang-format +// clang-format off + +// NOTE This header is derived from the following file (in TensorFlow v1.12) +// 'externals/tensorflow/tensorflow/contrib/lite/profiling/time.h +#ifndef TENSORFLOW_CONTRIB_LITE_PROFILING_TIME_H_ +#define TENSORFLOW_CONTRIB_LITE_PROFILING_TIME_H_ + +#include <cstdint> + +namespace tflite { +namespace profiling { +namespace time { +uint64_t NowMicros(); +} // namespace time +} // namespace profiling +} // namespace tflite +#endif // TENSORFLOW_CONTRIB_LITE_PROFILING_TIME_H_ + +// clang-format on diff --git a/libs/profiling/src/profiling/time.cpp b/libs/profiling/src/profiling/time.cpp new file mode 100644 index 000000000..92d8595f8 --- /dev/null +++ b/libs/profiling/src/profiling/time.cpp @@ -0,0 +1,55 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// NOTE To minimize diff with upstream tensorflow, disable clang-format +// clang-format off + +// NOTE This header is derived from the following file (in TensorFlow v1.12) +// 'externals/tensorflow/tensorflow/contrib/lite/profiling/time.cpp +#include "profiling/time.h" + +#if defined(_MSC_VER) +#include <chrono> // NOLINT(build/c++11) +#else +#include <sys/time.h> +#endif + +namespace tflite { +namespace profiling { +namespace time { + +#if defined(_MSC_VER) + +uint64_t NowMicros() { + return std::chrono::duration_cast<std::chrono::microseconds>( + std::chrono::system_clock::now().time_since_epoch()) + .count(); +} + +#else + +uint64_t NowMicros() { + struct timeval tv; + gettimeofday(&tv, nullptr); + return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec; +} + +#endif // defined(_MSC_VER) + +} // namespace time +} // namespace profiling +} // namespace tflite + +// clang-format on diff --git a/libs/support/CMakeLists.txt b/libs/support/CMakeLists.txt deleted file mode 100644 index c91677266..000000000 --- a/libs/support/CMakeLists.txt +++ /dev/null @@ -1,2 +0,0 @@ -add_subdirectory(tflite) -add_subdirectory(nnapi) diff --git a/libs/support/nnapi/CMakeLists.txt b/libs/support/nnapi/CMakeLists.txt deleted file mode 100644 index 193bcbd4e..000000000 --- a/libs/support/nnapi/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -file(GLOB_RECURSE SOURCES "src/*.cpp") - -add_library(nnfw_support_nnapi ${SOURCES}) -set_property(TARGET nnfw_support_nnapi PROPERTY POSITION_INDEPENDENT_CODE ON) -target_include_directories(nnfw_support_nnapi PUBLIC ${CMAKE_SOURCE_DIR}/include) -target_link_libraries(nnfw_support_nnapi static_nnfw_util) diff --git a/libs/support/nnapi/src/Utils.cpp b/libs/support/nnapi/src/Utils.cpp deleted file mode 100644 index ae1076fd1..000000000 --- a/libs/support/nnapi/src/Utils.cpp +++ /dev/null @@ -1,29 +0,0 @@ -#include "support/nnapi/Utils.h" - -#include <cassert> - -namespace nnfw -{ -namespace support -{ -namespace nnapi -{ 
- -const char *to_string(const PaddingCode &code) -{ - assert((ANEURALNETWORKS_PADDING_SAME == code) || (ANEURALNETWORKS_PADDING_VALID == code)); - - switch (code) - { - case ANEURALNETWORKS_PADDING_SAME: - return "ANEURALNETWORKS_PADDING_SAME"; - case ANEURALNETWORKS_PADDING_VALID: - return "ANEURALNETWORKS_PADDING_VALID"; - } - - return nullptr; -} - -} // namespace nnapi -} // namespace support -} // namespace nnfw diff --git a/libs/support/tflite/CMakeLists.txt b/libs/support/tflite/CMakeLists.txt deleted file mode 100644 index 667b3bc11..000000000 --- a/libs/support/tflite/CMakeLists.txt +++ /dev/null @@ -1,12 +0,0 @@ -file(GLOB_RECURSE SOURCES "src/*.cpp") -file(GLOB_RECURSE TESTS "src/*.test.cpp") -list(REMOVE_ITEM SOURCES ${TESTS}) - -add_library(nnfw_support_tflite STATIC ${SOURCES}) -set_target_properties(nnfw_support_tflite PROPERTIES POSITION_INDEPENDENT_CODE ON) -target_include_directories(nnfw_support_tflite PUBLIC ${CMAKE_SOURCE_DIR}/include) -target_link_libraries(nnfw_support_tflite tensorflow-lite ${LIB_PTHREAD} dl) -target_link_libraries(nnfw_support_tflite static_nnfw_util) - -add_executable(nnfw_support_tflite_test_TensorView src/TensorView.test.cpp) -target_link_libraries(nnfw_support_tflite_test_TensorView nnfw_support_tflite) diff --git a/libs/support/tflite/src/TensorView.test.cpp b/libs/support/tflite/src/TensorView.test.cpp deleted file mode 100644 index 1d3a70500..000000000 --- a/libs/support/tflite/src/TensorView.test.cpp +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "support/tflite/TensorView.h" - -#include <cassert> - -void int_test(void) -{ - int value[6] = {1, 2, 3, 4, 5, 6}; - - const nnfw::util::tensor::Shape shape{2, 3}; - const nnfw::support::tflite::TensorView<int> view{shape, value}; - - assert(view.at(nnfw::util::tensor::Index{0, 0}) == 1); - assert(view.at(nnfw::util::tensor::Index{0, 1}) == 2); - assert(view.at(nnfw::util::tensor::Index{0, 2}) == 3); - assert(view.at(nnfw::util::tensor::Index{1, 0}) == 4); - assert(view.at(nnfw::util::tensor::Index{1, 1}) == 5); - assert(view.at(nnfw::util::tensor::Index{1, 2}) == 6); -} - -int main(int argc, char **argv) -{ - float value[6] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; - - const nnfw::util::tensor::Shape shape{2, 3}; - const nnfw::support::tflite::TensorView<float> view{shape, value}; - - assert(view.at(nnfw::util::tensor::Index{0, 0}) == 1.0f); - assert(view.at(nnfw::util::tensor::Index{0, 1}) == 2.0f); - assert(view.at(nnfw::util::tensor::Index{0, 2}) == 3.0f); - assert(view.at(nnfw::util::tensor::Index{1, 0}) == 4.0f); - assert(view.at(nnfw::util::tensor::Index{1, 1}) == 5.0f); - assert(view.at(nnfw::util::tensor::Index{1, 2}) == 6.0f); - - int_test(); - - return 0; -} diff --git a/libs/support/tflite/src/kernels/RSQRT.cpp b/libs/support/tflite/src/kernels/RSQRT.cpp deleted file mode 100644 index 13efe0ed9..000000000 --- a/libs/support/tflite/src/kernels/RSQRT.cpp +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "support/tflite/kernels/RSQRT.h" -#include "tensorflow/contrib/lite/kernels/kernel_util.h" - -#include <cmath> -#include <iostream> - -namespace tflite -{ -namespace ops -{ -namespace custom -{ -namespace nnfw -{ -namespace RSQRT -{ - -void *InitRSQRT(TfLiteContext *context, const char *buffer, size_t length) { return nullptr; } - -void FreeRSQRT(TfLiteContext *context, void *buffer) {} - -TfLiteStatus PrepareRSQRT(TfLiteContext *context, TfLiteNode *node) -{ - TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); - TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor *input = GetInput(context, node, 0); - TfLiteTensor *output = GetOutput(context, node, 0); - TF_LITE_ENSURE_EQ(context, input->type, output->type); - // Quantized float is not supported yet. 
- TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32); - return context->ResizeTensor(context, output, TfLiteIntArrayCopy(input->dims)); -} - -inline TfLiteStatus Eval(TfLiteContext *context, TfLiteNode *node, float float_func(float)) -{ - const TfLiteTensor *input = GetInput(context, node, 0); - TfLiteTensor *output = GetOutput(context, node, 0); - switch (input->type) - { - case kTfLiteFloat32: - { - size_t elements = NumElements(input); - const float *in = input->data.f; - const float *in_end = in + elements; - float *out = output->data.f; - for (; in < in_end; in++, out++) - *out = float_func(*in); - return kTfLiteOk; - } - default: - { - context->ReportError(context, "Input type is %d, requires float32", input->type); - return kTfLiteError; - } - } -} - -TfLiteStatus EvalRSQRT(TfLiteContext *context, TfLiteNode *node) -{ - return Eval(context, node, [](float f) { return 1.f / std::sqrt(f); }); -} - -} // namespace RSQRT -} // namespace nnfw -} // namespace custom -} // namespace ops -} // namespace tflite diff --git a/libs/support/tflite/src/nnapi_delegate.cpp b/libs/support/tflite/src/nnapi_delegate.cpp deleted file mode 100644 index 1eada4bca..000000000 --- a/libs/support/tflite/src/nnapi_delegate.cpp +++ /dev/null @@ -1,720 +0,0 @@ -/* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// NOTE To minimize diff with upstream tensorflow, disable clang-format -// clang-format off - -// NOTE This code is derived from the following file (in TensorFlow) -// 'externals/tensorflow/tensorflow/contrib/lite/nnapi_delegate.cc' -#include "support/tflite/nnapi_delegate.h" -#include <fcntl.h> -#include <sys/mman.h> -#include <sys/stat.h> -#include <sys/types.h> -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/error_reporter.h" -#include "tensorflow/contrib/lite/model.h" -#include "NeuralNetworksShim.h" -#include "NeuralNetworksExShim.h" - -#ifdef __ANDROID__ -#include <sys/system_properties.h> -#endif - -namespace nnfw -{ - -// TODO(aselle): FATAL leaves resources hanging. -void FATAL(const char* format, ...) { - va_list args; - va_start(args, format); - vfprintf(stderr, format, args); - va_end(args); - fflush(stderr); - exit(1); -} - -// TODO(aselle): Change the error model to use status codes. 
-#define CHECK_TFLITE_SUCCESS(x) \ - if (x != kTfLiteOk) { \ - FATAL("Aborting since tflite returned failure."); \ - } - -#define CHECK_NN(x) \ - if (x != ANEURALNETWORKS_NO_ERROR) { \ - FATAL("Aborting since tflite returned failure."); \ - } - -namespace { - -int32_t GetAndroidSdkVersion() { -#ifdef __ANDROID__ - const char* sdkProp = "ro.build.version.sdk"; - char sdkVersion[PROP_VALUE_MAX]; - int length = __system_property_get(sdkProp, sdkVersion); - if (length != 0) { - for (int i = 0; i < length; ++i) { - int digit = sdkVersion[i] - '0'; - if (digit < 0 || digit > 9) { - // Non-numeric SDK version, assume it's higher then expected; - return 0xFFFF; - } - } - return atoi(sdkVersion); - } - FATAL("No %s prop", sdkProp); -#endif // __ANDROID__ - return 0; -} - -static const int32_t kAndroidSdkVersion = GetAndroidSdkVersion(); - -} // namespace - -NNAPIAllocation::NNAPIAllocation(const char* filename, - ::tflite::ErrorReporter* error_reporter) - : MMAPAllocation(filename, error_reporter) { - if (mmapped_buffer_ != MAP_FAILED) - CHECK_NN(ANeuralNetworksMemory_createFromFd(buffer_size_bytes_, PROT_READ, - mmap_fd_, 0, &handle_)); -} - -NNAPIAllocation::~NNAPIAllocation() { - if (handle_) { - ANeuralNetworksMemory_free(handle_); - } -} - -NNAPIDelegate::~NNAPIDelegate() { - if (nn_compiled_model_) { - ANeuralNetworksCompilation_free(nn_compiled_model_); - nn_compiled_model_ = nullptr; - } - if (nn_model_) { - ANeuralNetworksModel_free(nn_model_); - nn_model_ = nullptr; - // TODO(aselle): Is this thread-safe and callable multiple times? - } - // ANeuralNetworksShutdown(); -} - -// Adds the tensors of the interpreter to the NN API model. -// Returns the number of operands added. -uint32_t addTensorOperands(tflite::Interpreter* interpreter, - ANeuralNetworksModel* nn_model, - const std::vector<uint32_t>& skip_list) { - uint32_t next_id = 0; - for (size_t i = 0; i < interpreter->tensors_size(); i++) { - // skip temporaries tensors. - bool shouldSkip = false; - for (auto skip_idx : skip_list) { - if (i == skip_idx) { - shouldSkip = true; - break; - } - } - if (shouldSkip) continue; - - int32_t nn_type = 0; - // NNAPI requires 32-bit float scale to be zero, tflite doesn't care - float scale = 0.0f; - int32_t zeroPoint = 0; - TfLiteTensor* tensor = interpreter->tensor(i); - switch (tensor->type) { - case kTfLiteNoType: - // Tensors added during initialization of Ops don't have a type yet and - // should not be registered with the NNAPI. - continue; - case kTfLiteFloat32: - nn_type = ANEURALNETWORKS_TENSOR_FLOAT32; - break; - case kTfLiteUInt8: - nn_type = ANEURALNETWORKS_TENSOR_QUANT8_ASYMM; - scale = tensor->params.scale; - // FIXME The next line is a workaround because currently zero scale is - // passed down from TF - // Lite. Note that the latest NeuralNetworks.h (see - // https://android.googlesource.com/platform/frameworks/ml/+/master/nn/runtime/include/NeuralNetworks.h) - // requires scale to be greater than zero. Remove this workaround - // when the scale - // value is correctly passed. - scale = (scale == 0.0f) ? 1.0f : scale; - zeroPoint = tensor->params.zero_point; - break; - case kTfLiteInt32: - nn_type = ANEURALNETWORKS_TENSOR_INT32; - scale = tensor->params.scale; - zeroPoint = tensor->params.zero_point; - break; - default: - FATAL("Unsupported type."); - } - // TODO(aselle): Note, many of these are intermediate results. Do I need - // to ever specify these sizes. I am currently below doing setValue - // on all of them, but I shouldn't in the future. 
- // Answer(jeanluc): If all the operators can set the dimension correctly, - // you won't need to. - ANeuralNetworksOperandType operand_type{ - nn_type, static_cast<uint32_t>(tensor->dims->size), - reinterpret_cast<uint32_t*>(tensor->dims->data), scale, zeroPoint}; - CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type)); - // TODO(aselle): Based on Michael's suggestion, limiting this to read - // only memory - if (tensor->allocation_type == kTfLiteMmapRo) { - if (const NNAPIAllocation* alloc = dynamic_cast<const NNAPIAllocation*>( - static_cast<const ::tflite::Allocation*>(tensor->allocation))) { - CHECK_NN(ANeuralNetworksModel_setOperandValueFromMemory( - nn_model, next_id, alloc->memory(), alloc->offset(tensor->data.raw), - tensor->bytes)); - } else { - CHECK_NN(ANeuralNetworksModel_setOperandValue( - nn_model, next_id, tensor->data.raw, tensor->bytes)); - } - } else if (tensor->bytes == 0) { - // These size 0 tensors are optional tensors reserved. - CHECK_NN( - ANeuralNetworksModel_setOperandValue(nn_model, next_id, nullptr, 0)); - } - - ++next_id; - } - return next_id; -} - -// Adds the operations and their parameters to the NN API model. -// 'next-id' is the operand ID of the next operand of the model. -void AddOpsAndParams(tflite::Interpreter* interpreter, - ANeuralNetworksModel* nn_model, uint32_t next_id, - std::vector<int>* model_state_inputs, - std::vector<int>* model_state_outputs) { - for (size_t i = 0; i < interpreter->nodes_size(); i++) { - const auto* node_and_registration = interpreter->node_and_registration(i); - const TfLiteNode& node = node_and_registration->first; - const TfLiteRegistration& registration = node_and_registration->second; - tflite::BuiltinOperator builtin = - static_cast<tflite::BuiltinOperator>(registration.builtin_code); - - // Add the parameters. - std::vector<uint32_t> augmented_inputs( - node.inputs->data, node.inputs->data + node.inputs->size); - std::vector<uint32_t> augmented_outputs( - node.outputs->data, node.outputs->data + node.outputs->size); - - auto add_scalar_int32 = [&nn_model, &augmented_inputs, - &next_id](int value) { - ANeuralNetworksOperandType operand_type{.type = ANEURALNETWORKS_INT32}; - CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type)) - CHECK_NN(ANeuralNetworksModel_setOperandValue(nn_model, next_id, &value, - sizeof(int32_t))) - augmented_inputs.push_back(next_id++); - }; - - auto add_scalar_float32 = [&nn_model, &augmented_inputs, - &next_id](float value) { - ANeuralNetworksOperandType operand_type{.type = ANEURALNETWORKS_FLOAT32}; - CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type)) - CHECK_NN(ANeuralNetworksModel_setOperandValue(nn_model, next_id, &value, - sizeof(float))) - augmented_inputs.push_back(next_id++); - }; - - // Handle state tensors of RNN, LSTM, SVDF. - // For each state_out tensor, a corresponding state_in operand needs to be - // created for NNAPI. 
- auto duplicate_state_tensor_float32 = - [interpreter, &nn_model, &next_id, &augmented_inputs, - &model_state_inputs, &model_state_outputs](int tensor_id) { - const TfLiteTensor* tensor = interpreter->tensor(tensor_id); - ANeuralNetworksOperandType operand_type{ - ANEURALNETWORKS_TENSOR_FLOAT32, - static_cast<uint32_t>(tensor->dims->size), - reinterpret_cast<uint32_t*>(tensor->dims->data), - tensor->params.scale, tensor->params.zero_point}; - CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type)); - augmented_inputs.push_back(next_id); - model_state_inputs->push_back(next_id); - model_state_outputs->push_back(tensor_id); - next_id++; - }; - - auto add_add_params = [&add_scalar_int32]() { add_scalar_int32(0); }; - - auto add_pooling_params = [&add_scalar_int32](void* data) { - auto builtin = reinterpret_cast<TfLitePoolParams*>(data); - add_scalar_int32(builtin->padding); - add_scalar_int32(builtin->stride_width); - add_scalar_int32(builtin->stride_height); - add_scalar_int32(builtin->filter_width); - add_scalar_int32(builtin->filter_height); - add_scalar_int32(builtin->activation); - }; - - auto add_convolution_params = [&add_scalar_int32](void* data) { - auto builtin = reinterpret_cast<TfLiteConvParams*>(data); - add_scalar_int32(builtin->padding); - add_scalar_int32(builtin->stride_width); - add_scalar_int32(builtin->stride_height); - add_scalar_int32(builtin->activation); - }; - - auto add_depthwise_conv_params = [&add_scalar_int32](void* data) { - auto builtin = reinterpret_cast<TfLiteDepthwiseConvParams*>(data); - add_scalar_int32(builtin->padding); - add_scalar_int32(builtin->stride_width); - add_scalar_int32(builtin->stride_height); - add_scalar_int32(builtin->depth_multiplier); - add_scalar_int32(builtin->activation); - }; - - auto add_fully_connected_params = [&add_scalar_int32](void* data) { - auto builtin = reinterpret_cast<TfLiteFullyConnectedParams*>(data); - add_scalar_int32(builtin->activation); - }; - - auto add_concatenation_params = [&add_scalar_int32](void* data) { - auto builtin = reinterpret_cast<TfLiteConcatenationParams*>(data); - add_scalar_int32(builtin->axis); - if (builtin->activation != kTfLiteActNone) { - FATAL("Concatenation does not support fused activation in NNAPI"); - } - }; - - auto add_softmax_params = [&add_scalar_float32](void* data) { - auto builtin = reinterpret_cast<TfLiteSoftmaxParams*>(data); - add_scalar_float32(builtin->beta); - }; - - auto add_space_to_depth_params = [&add_scalar_int32](void* data) { - auto builtin = reinterpret_cast<TfLiteSpaceToDepthParams*>(data); - add_scalar_int32(builtin->block_size); - }; - - auto add_lstm_params = [&add_scalar_int32, - &add_scalar_float32](void* data) { - auto builtin = reinterpret_cast<TfLiteLSTMParams*>(data); - add_scalar_int32(builtin->activation); - add_scalar_float32(builtin->cell_clip); - add_scalar_float32(builtin->proj_clip); - }; - - // LSTM in NNAPI requires scratch tensor as an output operand. 
- auto add_lstm_scratch_tensor_float32 = [interpreter, &node, &nn_model, - &next_id, &augmented_outputs]() { - int scratch_buffer_index = node.temporaries->data[0]; - const TfLiteTensor* tensor = interpreter->tensor(scratch_buffer_index); - ANeuralNetworksOperandType operand_type{ - ANEURALNETWORKS_TENSOR_FLOAT32, - static_cast<uint32_t>(tensor->dims->size), - reinterpret_cast<uint32_t*>(tensor->dims->data), tensor->params.scale, - tensor->params.zero_point}; - CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type)); - augmented_outputs.insert(augmented_outputs.begin(), next_id++); - }; - - auto add_mean_params = [&add_scalar_int32](void* data) { - auto builtin = reinterpret_cast<TfLiteMeanParams*>(data); - add_scalar_int32(builtin->keep_dims); - }; - - auto add_svdf_params = [&add_scalar_int32](void* data) { - auto builtin = reinterpret_cast<TfLiteSVDFParams*>(data); - add_scalar_int32(builtin->rank); - add_scalar_int32(builtin->activation); - }; - - auto add_rnn_params = [&add_scalar_int32](void* data) { - auto builtin = reinterpret_cast<TfLiteRNNParams*>(data); - add_scalar_int32(builtin->activation); - }; - - // Handle optional input tensors. - auto add_optional_tensors = [&nn_model, &augmented_inputs, - &next_id](int nn_type) { - for (size_t idx = 0; idx < augmented_inputs.size(); idx++) { - if (augmented_inputs[idx] == kOptionalTensor) { - const std::vector<uint32_t> dim = {0, 0}; - ANeuralNetworksOperandType operand_type{nn_type, 2, dim.data(), 0, 0}; - CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type)) - CHECK_NN(ANeuralNetworksModel_setOperandValue(nn_model, next_id, - nullptr, 0)) - augmented_inputs[idx] = next_id++; - } - } - }; - - int nnapi_version = 10; -#include "nnapi_delegate_ex_AddOpsAndParams_lambda.inc" - - ANeuralNetworksOperationType nn_op_type; - - switch (builtin) { - case tflite::BuiltinOperator_ADD: - nn_op_type = ANEURALNETWORKS_ADD; - add_add_params(); - break; - case tflite::BuiltinOperator_MUL: - nn_op_type = ANEURALNETWORKS_MUL; - add_add_params(); - break; - case tflite::BuiltinOperator_AVERAGE_POOL_2D: - add_pooling_params(node.builtin_data); - nn_op_type = ANEURALNETWORKS_AVERAGE_POOL_2D; - break; - case tflite::BuiltinOperator_MAX_POOL_2D: - add_pooling_params(node.builtin_data); - nn_op_type = ANEURALNETWORKS_MAX_POOL_2D; - break; - case tflite::BuiltinOperator_L2_POOL_2D: - add_pooling_params(node.builtin_data); - nn_op_type = ANEURALNETWORKS_L2_POOL_2D; - break; - case tflite::BuiltinOperator_CONV_2D: - add_convolution_params(node.builtin_data); - nn_op_type = ANEURALNETWORKS_CONV_2D; - break; - case tflite::BuiltinOperator_RELU: - nn_op_type = ANEURALNETWORKS_RELU; - break; - case tflite::BuiltinOperator_RELU_N1_TO_1: - nn_op_type = ANEURALNETWORKS_RELU1; - break; - case tflite::BuiltinOperator_RELU6: - nn_op_type = ANEURALNETWORKS_RELU6; - break; - case tflite::BuiltinOperator_TANH: - nn_op_type = ANEURALNETWORKS_TANH; - break; - case tflite::BuiltinOperator_FLOOR: - nn_op_type = ANEURALNETWORKS_FLOOR; - break; - case tflite::BuiltinOperator_LOGISTIC: - nn_op_type = ANEURALNETWORKS_LOGISTIC; - break; - case tflite::BuiltinOperator_DEPTHWISE_CONV_2D: - add_depthwise_conv_params(node.builtin_data); - nn_op_type = ANEURALNETWORKS_DEPTHWISE_CONV_2D; - break; - case tflite::BuiltinOperator_CONCATENATION: - add_concatenation_params(node.builtin_data); - nn_op_type = ANEURALNETWORKS_CONCATENATION; - break; - case tflite::BuiltinOperator_SOFTMAX: - add_softmax_params(node.builtin_data); - nn_op_type = ANEURALNETWORKS_SOFTMAX; - 
break; - case tflite::BuiltinOperator_FULLY_CONNECTED: - add_fully_connected_params(node.builtin_data); - nn_op_type = ANEURALNETWORKS_FULLY_CONNECTED; - break; - case tflite::BuiltinOperator_RESHAPE: - nn_op_type = ANEURALNETWORKS_RESHAPE; - // add_reshape_params(node.builtin_data); - break; - case tflite::BuiltinOperator_RESIZE_BILINEAR: - add_resize_bilinear_params(node.builtin_data); - nn_op_type = ANEURALNETWORKS_RESIZE_BILINEAR; - break; - case tflite::BuiltinOperator_SPACE_TO_DEPTH: - add_space_to_depth_params(node.builtin_data); - nn_op_type = ANEURALNETWORKS_SPACE_TO_DEPTH; - break; - case tflite::BuiltinOperator_LSTM: { - duplicate_state_tensor_float32( - node.outputs->data[/*kOutputStateTensor*/ 0]); - duplicate_state_tensor_float32( - node.outputs->data[/*kCellStateTensor*/ 1]); - add_lstm_params(node.builtin_data); - add_lstm_scratch_tensor_float32(); - add_optional_tensors(ANEURALNETWORKS_TENSOR_FLOAT32); - nn_op_type = ANEURALNETWORKS_LSTM; - break; - } - case tflite::BuiltinOperator_DEQUANTIZE: - nn_op_type = ANEURALNETWORKS_DEQUANTIZE; - break; - case tflite::BuiltinOperator_SVDF: { - duplicate_state_tensor_float32(node.outputs->data[/*kStateTensor*/ 0]); - add_svdf_params(node.builtin_data); - nn_op_type = ANEURALNETWORKS_SVDF; - break; - } - case tflite::BuiltinOperator_RNN: { - duplicate_state_tensor_float32( - node.outputs->data[/*kHiddenStateTensor*/ 0]); - add_rnn_params(node.builtin_data); - nn_op_type = ANEURALNETWORKS_RNN; - break; - } - case tflite::BuiltinOperator_EMBEDDING_LOOKUP: - nn_op_type = ANEURALNETWORKS_EMBEDDING_LOOKUP; - break; - case tflite::BuiltinOperator_PAD: - nnapi_version = 11; // require NNAPI 1.1 - nn_op_type = ANEURALNETWORKS_PAD; - break; - case tflite::BuiltinOperator_MEAN: - nnapi_version = 11; // require NNAPI 1.1 - add_mean_params(node.builtin_data); - nn_op_type = ANEURALNETWORKS_MEAN; - break; - case tflite::BuiltinOperator_DIV: - nnapi_version = 11; // require NNAPI 1.1 - nn_op_type = ANEURALNETWORKS_DIV; - add_add_params(); - break; - case tflite::BuiltinOperator_SUB: - nnapi_version = 11; // require NNAPI 1.1 - nn_op_type = ANEURALNETWORKS_SUB; - add_add_params(); - break; - case tflite::BuiltinOperator_STRIDED_SLICE: - add_strided_slice_params(node.builtin_data); - nn_op_type = ANEURALNETWORKS_STRIDED_SLICE; - break; - case tflite::BuiltinOperator_CAST: - CHECK_NN(ANeuralNetworksModel_addOperationEx( - nn_model, ANEURALNETWORKS_CAST_EX, - static_cast<uint32_t>(augmented_inputs.size()), - augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size), - reinterpret_cast<uint32_t*>(node.outputs->data))); - continue; - case tflite::BuiltinOperator_TOPK_V2: - CHECK_NN(ANeuralNetworksModel_addOperationEx( - nn_model, ANEURALNETWORKS_TOPK_V2_EX, - static_cast<uint32_t>(augmented_inputs.size()), - augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size), - reinterpret_cast<uint32_t*>(node.outputs->data))); - continue; - case tflite::BuiltinOperator_GATHER: - add_gather_ex_params(node.builtin_data); - CHECK_NN(ANeuralNetworksModel_addOperationEx( - nn_model, ANEURALNETWORKS_GATHER_EX, - static_cast<uint32_t>(augmented_inputs.size()), - augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size), - reinterpret_cast<uint32_t*>(node.outputs->data))); - continue; - case tflite::BuiltinOperator_SPLIT: - CHECK_NN(ANeuralNetworksModel_addOperationEx( - nn_model, ANEURALNETWORKS_SPLIT_EX, - static_cast<uint32_t>(augmented_inputs.size()), - augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size), - 
reinterpret_cast<uint32_t*>(node.outputs->data))); - continue; - case tflite::BuiltinOperator_TRANSPOSE: - nn_op_type = ANEURALNETWORKS_TRANSPOSE; - // param is almost same as reshape - break; - case tflite::BuiltinOperator_CONCAT_EMBEDDINGS: - case tflite::BuiltinOperator_LSH_PROJECTION: - case tflite::BuiltinOperator_HASHTABLE_LOOKUP: - case tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN: - case tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN: - case tflite::BuiltinOperator_EMBEDDING_LOOKUP_SPARSE: - case tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM: - case tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM: - case tflite::BuiltinOperator_L2_NORMALIZATION: - case tflite::BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION: - case tflite::BuiltinOperator_PADV2: - case tflite::BuiltinOperator_CALL: - case tflite::BuiltinOperator_SKIP_GRAM: - case tflite::BuiltinOperator_SPACE_TO_BATCH_ND: - case tflite::BuiltinOperator_BATCH_TO_SPACE_ND: - case tflite::BuiltinOperator_SQUEEZE: - case tflite::BuiltinOperator_EXP: - case tflite::BuiltinOperator_LOG_SOFTMAX: - case tflite::BuiltinOperator_DELEGATE: - case tflite::BuiltinOperator_PRELU: - case tflite::BuiltinOperator_MAXIMUM: - case tflite::BuiltinOperator_MINIMUM: - case tflite::BuiltinOperator_ARG_MAX: - case tflite::BuiltinOperator_GREATER: - case tflite::BuiltinOperator_GREATER_EQUAL: - case tflite::BuiltinOperator_LESS: - case tflite::BuiltinOperator_LESS_EQUAL: - case tflite::BuiltinOperator_NEG: - case tflite::BuiltinOperator_SELECT: - case tflite::BuiltinOperator_SLICE: - case tflite::BuiltinOperator_SIN: - case tflite::BuiltinOperator_TRANSPOSE_CONV: - case tflite::BuiltinOperator_SPARSE_TO_DENSE: - FATAL("Op code %d is currently not delegated to NNAPI", builtin); - nn_op_type = -1; // set to invalid - break; - case tflite::BuiltinOperator_CUSTOM: - std::string custom_name(registration.custom_name); - if (custom_name.compare("TensorFlowMax") == 0) { - CHECK_NN(ANeuralNetworksModel_addOperationEx( - nn_model, ANEURALNETWORKS_TENSORFLOW_MAX_EX, - static_cast<uint32_t>(augmented_inputs.size()), - augmented_inputs.data(), - static_cast<uint32_t>(node.outputs->size), - reinterpret_cast<uint32_t*>(node.outputs->data))); - continue; - } - else if (custom_name.compare("RSQRT") == 0) { - CHECK_NN(ANeuralNetworksModel_addOperationEx( - nn_model, ANEURALNETWORKS_RSQRT_EX, - static_cast<uint32_t>(augmented_inputs.size()), - augmented_inputs.data(), - static_cast<uint32_t>(node.outputs->size), - reinterpret_cast<uint32_t*>(node.outputs->data))); - continue; - } - else if (custom_name.compare("SquaredDifference") == 0) { - CHECK_NN(ANeuralNetworksModel_addOperationEx( - nn_model, ANEURALNETWORKS_SQUARED_DIFFERENCE_EX, - static_cast<uint32_t>(augmented_inputs.size()), - augmented_inputs.data(), - static_cast<uint32_t>(node.outputs->size), - reinterpret_cast<uint32_t*>(node.outputs->data))); - continue; - } - - FATAL("Custom operations are not supported when using NNAPI."); - nn_op_type = -1; // set to invalid - break; - } - - //if (nnapi_version == 11 && kAndroidSdkVersion < 28) { - // FATAL("Op %d needs NNAPI1.1", builtin); - //} - - // Add the operation. - CHECK_NN(ANeuralNetworksModel_addOperation( - nn_model, nn_op_type, static_cast<uint32_t>(augmented_inputs.size()), - augmented_inputs.data(), - static_cast<uint32_t>(augmented_outputs.size()), - reinterpret_cast<uint32_t*>(augmented_outputs.data()))); - } -} - -TfLiteStatus NNAPIDelegate::BuildGraph(::tflite::Interpreter* interpreter) { - // TODO(aselle): This is not correct. 
need to handle resize invalidation. - if (nn_model_ && nn_compiled_model_) return kTfLiteOk; - - if (!nn_model_) { - CHECK_NN(ANeuralNetworksModel_create(&nn_model_)); - - // Find all the temporary tensors and put them in a skip_list. - std::vector<uint32_t> skip_list; - for (size_t i = 0; i < interpreter->nodes_size(); i++) { - const auto* node_and_registration = interpreter->node_and_registration(i); - const TfLiteNode& node = node_and_registration->first; - if (node.temporaries != nullptr) { - for (int j = 0; j < node.temporaries->size; j++) { - skip_list.push_back(static_cast<uint32_t>(node.temporaries->data[j])); - } - } - } - - uint32_t next_id = addTensorOperands(interpreter, nn_model_, skip_list); - AddOpsAndParams(interpreter, nn_model_, next_id, &model_states_inputs_, - &model_states_outputs_); - - std::vector<int> augmented_inputs = interpreter->inputs(); - std::vector<int> augmented_outputs = interpreter->outputs(); - - // All state tensors input/output need to be treated as model input/output. - augmented_inputs.insert(augmented_inputs.end(), - model_states_inputs_.begin(), - model_states_inputs_.end()); - augmented_outputs.insert(augmented_outputs.end(), - model_states_outputs_.begin(), - model_states_outputs_.end()); - - CHECK_NN(ANeuralNetworksModel_identifyInputsAndOutputs( - nn_model_, static_cast<uint32_t>(augmented_inputs.size()), - reinterpret_cast<const uint32_t*>(augmented_inputs.data()), - static_cast<uint32_t>(augmented_outputs.size()), - reinterpret_cast<const uint32_t*>(augmented_outputs.data()))); - CHECK_NN(ANeuralNetworksModel_finish(nn_model_)); - } - if (!nn_compiled_model_) { - CHECK_NN(ANeuralNetworksCompilation_create(nn_model_, &nn_compiled_model_)); - CHECK_NN(ANeuralNetworksCompilation_finish(nn_compiled_model_)); - } - return kTfLiteOk; -} - -TfLiteStatus NNAPIDelegate::Invoke(::tflite::Interpreter* interpreter) { - if (!nn_model_) { - TF_LITE_ENSURE_STATUS(BuildGraph(interpreter)); - } - - ANeuralNetworksExecution* execution = nullptr; - CHECK_NN(ANeuralNetworksExecution_create(nn_compiled_model_, &execution)); - - // Currently perform deep copy of input buffer - for (size_t i = 0; i < interpreter->inputs().size(); i++) { - int input = interpreter->inputs()[i]; - // TODO(aselle): Is this what we want or do we want input instead? - // TODO(aselle): This should be called setInputValue maybe to be cons. - TfLiteTensor* tensor = interpreter->tensor(input); - CHECK_NN(ANeuralNetworksExecution_setInput( - execution, i, nullptr, tensor->data.raw, tensor->bytes)); - } - - // Tell nn api where to place final data. - for (size_t i = 0; i < interpreter->outputs().size(); i++) { - int output = interpreter->outputs()[i]; - TfLiteTensor* tensor = interpreter->tensor(output); - CHECK_NN(ANeuralNetworksExecution_setOutput( - execution, i, nullptr, tensor->data.raw, tensor->bytes)); - } - - // The state_out of previous invocation need to be mapped to state_in of - // current invocation. - for (size_t i = 0; i < model_states_outputs_.size(); i++) { - int state_tensor_idx = model_states_outputs_[i]; - TfLiteTensor* tensor = interpreter->tensor(state_tensor_idx); - // Here we are using a deep copy for state_in tensors so that we are not - // reading and writing into the same buffer during a invocation. - // TODO(miaowang): using double shared buffer to minimize the copies. - CHECK_NN(ANeuralNetworksExecution_setInput( - execution, i + interpreter->inputs().size(), nullptr, tensor->data.raw, - tensor->bytes)); - // Tell NNAPI where to output the state_out. 
- CHECK_NN(ANeuralNetworksExecution_setOutput( - execution, i + interpreter->outputs().size(), nullptr, tensor->data.raw, - tensor->bytes)); - } - - // Currently use blocking compute. - ANeuralNetworksEvent* event = nullptr; - CHECK_NN(ANeuralNetworksExecution_startCompute(execution, &event)); - CHECK_NN(ANeuralNetworksEvent_wait(event)); - ANeuralNetworksEvent_free(event); - ANeuralNetworksExecution_free(execution); - -#if 0 - printf("From the NN API:\n"); - TfLiteTensor* tensor = interpreter->tensor(interpreter->outputs()[0]); - if (float* data = - interpreter->typed_tensor<float>(interpreter->outputs()[0])) { - size_t num = tensor->bytes / sizeof(float); - for (float* p = data; p < data + num; p++) { - printf(" %f", *p); - } - printf("\n"); - } -#endif - - return kTfLiteOk; -} - -} // namespace nnfw - -// clang-format on diff --git a/libs/support/tflite/src/nnapi_delegate_ex_AddOpsAndParams_lambda.inc b/libs/support/tflite/src/nnapi_delegate_ex_AddOpsAndParams_lambda.inc deleted file mode 100644 index ea485fe45..000000000 --- a/libs/support/tflite/src/nnapi_delegate_ex_AddOpsAndParams_lambda.inc +++ /dev/null @@ -1,41 +0,0 @@ -// This file is included from AddOpsAndParams defined in nnapi_delegate.cc -// and contains lambda for extened implementation to original Tensorflow Lite. - auto add_resize_bilinear_params = [&add_scalar_int32, &interpreter, &augmented_inputs](void* data) { - auto builtin = reinterpret_cast<TfLiteResizeBilinearParams*>(data); - if (builtin->align_corners) { - FATAL("Resize bilinear does not support align corners in NNAPI"); - } - - TfLiteTensor* tensor = interpreter->tensor(augmented_inputs.back()); - assert(tensor->type == kTfLiteInt32); - assert(tensor->bytes == sizeof(int)*2); - augmented_inputs.pop_back(); - - int height = ((int*)(tensor->data.raw))[1]; - int width = ((int*)(tensor->data.raw))[0]; - add_scalar_int32(height); - add_scalar_int32(width); - }; - - auto add_strided_slice_params = [&add_scalar_int32](void* data) { - auto builtin = reinterpret_cast<TfLiteStridedSliceParams*>(data); - add_scalar_int32(builtin->begin_mask); - add_scalar_int32(builtin->end_mask); - // ellipsis_mask and new_axis_mask are not supported on nn runtime - // cf) tflite interpreter supports both operations - if (builtin->ellipsis_mask) { - FATAL("STRIDE_SLICE does not support ellipsis_mask in NNAPI"); - } - if (builtin->new_axis_mask) { - FATAL("STRIDE_SLICE does not support new_axis_mask in NNAPI"); - } - add_scalar_int32(builtin->shrink_axis_mask); - }; - - auto add_gather_ex_params = [&add_scalar_int32](void* data) { - auto builtin = reinterpret_cast<TfLiteGatherParams*>(data); - add_scalar_int32(builtin->axis); - if (builtin->axis != 0) { - FATAL("GATHER does not support axis>0 in NNAPI"); - } - }; diff --git a/libs/tflite/CMakeLists.txt b/libs/tflite/CMakeLists.txt new file mode 100644 index 000000000..e844d1c68 --- /dev/null +++ b/libs/tflite/CMakeLists.txt @@ -0,0 +1,12 @@ +file(GLOB_RECURSE SOURCES "src/*.cpp") +file(GLOB_RECURSE TESTS "src/*.test.cpp") +list(REMOVE_ITEM SOURCES ${TESTS}) + +add_library(nnfw_lib_tflite STATIC ${SOURCES}) +set_target_properties(nnfw_lib_tflite PROPERTIES POSITION_INDEPENDENT_CODE ON) +target_include_directories(nnfw_lib_tflite PUBLIC ${NNFW_INCLUDE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/include) +target_link_libraries(nnfw_lib_tflite tensorflow-lite ${LIB_PTHREAD} dl) +target_link_libraries(nnfw_lib_tflite nnfw_lib_misc) + +add_executable(nnfw_lib_tflite_test_TensorView src/TensorView.test.cpp) 
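+# Link the standalone TensorView test against the nnfw_lib_tflite helper library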
+target_link_libraries(nnfw_lib_tflite_test_TensorView nnfw_lib_tflite) diff --git a/libs/tflite/include/tflite/Assert.h b/libs/tflite/include/tflite/Assert.h new file mode 100644 index 000000000..6d12d37f6 --- /dev/null +++ b/libs/tflite/include/tflite/Assert.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file Assert.h + * @brief This file contains helper function of assertion + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __NNFW_TFLITE_ASSERT_H__ +#define __NNFW_TFLITE_ASSERT_H__ + +#include "tensorflow/contrib/lite/context.h" + +#include <sstream> + +#define STR_DETAIL(value) #value +#define STR(value) STR_DETAIL(value) + +#define TFLITE_ENSURE(exp) \ + { \ + const TfLiteStatus status = (exp); \ + \ + if (status != kTfLiteOk) \ + { \ + std::ostringstream ss; \ + ss << #exp << " failed (" << __FILE__ << ":" << __LINE__ << ")"; \ + throw std::runtime_error{ss.str()}; \ + } \ + } + +#endif // __NNFW_TFLITE_ASSERT_H__ diff --git a/libs/tflite/include/tflite/Diff.h b/libs/tflite/include/tflite/Diff.h new file mode 100644 index 000000000..15c672831 --- /dev/null +++ b/libs/tflite/include/tflite/Diff.h @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file Diff.h + * @brief This file contains classes for testing correctess of implementation + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __NNFW_TFLITE_DIFF_H__ +#define __NNFW_TFLITE_DIFF_H__ + +#include "tensorflow/contrib/lite/interpreter.h" + +#include "misc/tensor/Index.h" +#include "misc/tensor/Diff.h" +#include "misc/tensor/Shape.h" +#include "misc/tensor/Comparator.h" + +#include "tflite/TensorView.h" + +#include <functional> +#include <vector> + +/** + * @brief Class to define TfLite interpreter match application + */ +class TfLiteInterpMatchApp +{ +public: + /** + * @brief Construct a new TfLiteInterpMatchApp object with Comparator + * @param[in] comparator Comparator object for tensor comparation + */ + TfLiteInterpMatchApp(const nnfw::misc::tensor::Comparator &comparator) + : _verbose{false}, _comparator(comparator) + { + // DO NOTHING + } + +public: + /** + * @brief Get reference verbose for debugging information + * @return Reference of verbose value + */ + int &verbose(void) { return _verbose; } + +private: + int _verbose; + +public: + /** + * @brief Run two interpreter and return the output matching + * @param[in] pure Interpreter object of expected(with TfLite) + * @param[in] nnapi Interpreter object of obtained(through NNAPI) + * @return @c true if two Interpreter results are same, otherwise @c false + */ + bool run(::tflite::Interpreter &pure, ::tflite::Interpreter &nnapi) const; + /** + * @brief Compare two TensorView values and return the match result + * @param[in] expected TensorView object to read expected values + * @param[in] obtained TensorView object to read obtained values + * @param[in] id Tensor ID value used for debug message + * @return @c true if two TensorView values are same, otherwise @c false + */ + template <typename T> + bool compareSingleTensorView(const nnfw::tflite::TensorView<T> &expected, + const nnfw::tflite::TensorView<T> &obtained, int id) const; + +private: + const nnfw::misc::tensor::Comparator &_comparator; +}; + +#include "tflite/interp/Builder.h" +#include "tflite/Quantization.h" + +#include <random> + +/** + * @brief Class to generate random values + */ +class RandomGenerator +{ +public: + /** + * @brief Construct a new RandomGenerator object + * @param[in] seed Random seed value + * @param[in] mean Mean value of normal random number generation + * @param[in] stddev Standard deviation of random number generation + * @param[in] quantization TfLiteQuantizationParams type to represent quantization value + * (not used yet) + */ + RandomGenerator(int seed, float mean, float stddev, + const TfLiteQuantizationParams quantization = make_default_quantization()) + : _rand{seed}, _dist{mean, stddev}, _quantization{quantization} + { + // DO NOTHING + } + +public: + /** + * @brief Generate random numbers for type T + * @param[in] s Shape value + * @param[in] i Index value + * @return Random generated value + * @note This is same as T generate(void) as two input parameters are not used + */ + template <typename T> + T generate(const ::nnfw::misc::tensor::Shape &, const ::nnfw::misc::tensor::Index &) + { + return generate<T>(); + } + + /** + * @brief Generate random numbers for type T + * @return Random generated value + */ + template <typename T> T generate(void) { return _dist(_rand); } + +private: + std::minstd_rand _rand; + std::normal_distribution<float> _dist; + const TfLiteQuantizationParams _quantization; +}; + +template <> uint8_t RandomGenerator::generate<uint8_t>(void); + +/** + * @brief Structure for NNAPI correctness test + 
+/**
+ * @brief Structure for NNAPI correctness test
+ */
+struct RandomTestParam
+{
+  int verbose;               //!< Verbosity of debug information
+  int tolerance;             //!< Tolerance of value difference
+  int tensor_logging = 0;    //!< Save logging to a file if not 0
+  std::string log_path = ""; //!< Path of log file, meaningful only when tensor_logging is 1
+};
+
+/**
+ * @brief Class to define Random test runner
+ */
+class RandomTestRunner
+{
+public:
+  /**
+   * @brief Construct a new RandomTestRunner object
+   * @param[in] seed Random seed value
+   * @param[in] param RandomTestParam object for test runner
+   * @param[in] quantization TfLiteQuantizationParams type to represent quantization value
+   */
+  RandomTestRunner(int seed, const RandomTestParam &param,
+                   const TfLiteQuantizationParams quantization = make_default_quantization())
+      : _randgen{seed, 0.0f, 2.0f, quantization}, _param{param}
+  {
+    // DO NOTHING
+  }
+
+public:
+  /**
+   * @brief Run the random test runner
+   * @param[in] builder Interpreter Builder used to run
+   * @return 0 if the test succeeds, otherwise failure
+   */
+  int run(const nnfw::tflite::Builder &builder);
+
+public:
+  /**
+   * @brief Get RandomGenerator reference
+   * @return RandomGenerator reference
+   */
+  RandomGenerator &generator() { return _randgen; };
+
+private:
+  RandomGenerator _randgen;
+  const RandomTestParam _param;
+
+public:
+  /**
+   * @brief Create a RandomTestRunner object
+   * @param[in] seed Random seed value
+   * @return RandomTestRunner object
+   */
+  static RandomTestRunner make(int seed);
+};
+
+#endif // __NNFW_TFLITE_DIFF_H__
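For illustration only (not part of the patch): a sketch of how RandomTestRunner is typically driven with an interpreter Builder; FlatBufferBuilder is declared later in this change, and the model path here is a made-up example.

#include "tflite/Diff.h"
#include "tflite/interp/FlatBufferBuilder.h"

#include "tensorflow/contrib/lite/model.h"

int run_random_test(void)
{
  auto model = ::tflite::FlatBufferModel::BuildFromFile("model.tflite");
  nnfw::tflite::FlatBufferBuilder builder{*model};

  // run() builds one pure TfLite interpreter and one NNAPI-backed interpreter from the
  // same builder, feeds both the same input values, and compares their outputs
  auto runner = RandomTestRunner::make(/*seed=*/0);
  return runner.run(builder);
}
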
diff --git a/libs/tflite/include/tflite/FeatureView.h b/libs/tflite/include/tflite/FeatureView.h
new file mode 100644
index 000000000..06cbf4b14
--- /dev/null
+++ b/libs/tflite/include/tflite/FeatureView.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file FeatureView.h
+ * @brief This file contains the FeatureView class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_TFLITE_FEATURE_VIEW_H__
+#define __NNFW_TFLITE_FEATURE_VIEW_H__
+
+#include "tensorflow/contrib/lite/interpreter.h"
+
+#include "tflite/InputIndex.h"
+#include "tflite/OutputIndex.h"
+
+#include "misc/feature/Shape.h"
+#include "misc/feature/Reader.h"
+
+namespace nnfw
+{
+namespace tflite
+{
+
+template <typename T> class FeatureView;
+
+/**
+ * @brief Class to support reading elements of a float-typed feature
+ */
+template <> class FeatureView<float> : public nnfw::misc::feature::Reader<float>
+{
+public:
+  /**
+   * @brief Construct a new FeatureView object
+   * @param[in] interp Interpreter to read from
+   * @param[in] index InputIndex index of input
+   */
+  FeatureView(::tflite::Interpreter &interp, const InputIndex &index);
+  /**
+   * @brief Construct a new FeatureView object
+   * @param[in] interp Interpreter to read from
+   * @param[in] index OutputIndex index of output
+   */
+  FeatureView(::tflite::Interpreter &interp, const OutputIndex &index);
+
+public:
+  /**
+   * @brief Get the value of an element using channel, row and column index
+   * @param[in] ch Channel index
+   * @param[in] row Row index
+   * @param[in] col Column index
+   * @return Value of element
+   */
+  float at(uint32_t ch, uint32_t row, uint32_t col) const;
+  /**
+   * @brief Get a reference to an element using channel, row and column index
+   * @param[in] ch Channel index
+   * @param[in] row Row index
+   * @param[in] col Column index
+   * @return Reference of element
+   */
+  float &at(uint32_t ch, uint32_t row, uint32_t col);
+
+private:
+  /**
+   * @brief Get the offset of an element from channel, row and column index
+   * @param[in] ch Channel index
+   * @param[in] row Row index
+   * @param[in] col Column index
+   * @return Offset of element
+   */
+  uint32_t getElementOffset(uint32_t ch, uint32_t row, uint32_t col) const
+  {
+    uint32_t res = 0;
+
+    // TensorFlow Lite assumes NHWC ordering for tensors
+    res += row * _shape.W * _shape.C;
+    res += col * _shape.C;
+    res += ch;
+
+    return res;
+  }
+
+private:
+  nnfw::misc::feature::Shape _shape;
+  float *_base;
+};
+
+} // namespace tflite
+} // namespace nnfw
+
+#endif // __NNFW_TFLITE_FEATURE_VIEW_H__
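For illustration only (not part of the patch): a sketch of reading a single element of a float feature map through FeatureView, assuming output 0 of the interpreter is a 4-D float tensor; the function name is hypothetical.

#include "tflite/FeatureView.h"

float read_output_element(::tflite::Interpreter &interp)
{
  const nnfw::tflite::OutputIndex index{0};
  nnfw::tflite::FeatureView<float> view{interp, index};

  // at(ch, row, col) follows the NHWC layout described in getElementOffset() above
  return view.at(0, 0, 0);
}
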
diff --git a/libs/tflite/include/tflite/InputIndex.h b/libs/tflite/include/tflite/InputIndex.h
new file mode 100644
index 000000000..f535b2626
--- /dev/null
+++ b/libs/tflite/include/tflite/InputIndex.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file InputIndex.h
+ * @brief This file contains the InputIndex class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_TFLITE_INPUT_INDEX_H__
+#define __NNFW_TFLITE_INPUT_INDEX_H__
+
+namespace nnfw
+{
+namespace tflite
+{
+
+/**
+ * @brief Class to express an index of input
+ */
+class InputIndex
+{
+public:
+  /**
+   * @brief Construct a new InputIndex object with an index value
+   * @param [in] index The value of index
+   */
+  InputIndex(int index) : _index(index)
+  {
+    // DO NOTHING
+  }
+
+public:
+  /**
+   * @brief Get index value as int
+   * @return Index value as int
+   */
+  int asInt(void) const { return _index; }
+
+private:
+  int _index;
+};
+
+} // namespace tflite
+} // namespace nnfw
+
+#endif // __NNFW_TFLITE_INPUT_INDEX_H__
diff --git a/libs/tflite/include/tflite/InterpreterSession.h b/libs/tflite/include/tflite/InterpreterSession.h
new file mode 100644
index 000000000..deaf05a7f
--- /dev/null
+++ b/libs/tflite/include/tflite/InterpreterSession.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file InterpreterSession.h
+ * @brief This file contains the InterpreterSession class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_TFLITE_INTERPRETER_SESSION_H__
+#define __NNFW_TFLITE_INTERPRETER_SESSION_H__
+
+#include "Session.h"
+
+namespace nnfw
+{
+namespace tflite
+{
+
+/**
+ * @brief Class to define a TfLite interpreter session which is inherited from the Session class
+ */
+class InterpreterSession final : public Session
+{
+public:
+  /**
+   * @brief Construct an InterpreterSession object with a TfLite interpreter
+   * @param[in] interp The TfLite interpreter pointer
+   */
+  InterpreterSession(::tflite::Interpreter *interp) : _interp{interp}
+  {
+    // DO NOTHING
+  }
+
+public:
+  /**
+   * @brief Get the TfLite interpreter pointer
+   * @return The TfLite interpreter
+   */
+  ::tflite::Interpreter *interp(void) override { return _interp; }
+
+public:
+  /**
+   * @brief Prepare the TfLite interpreter session
+   * @return @c true if tensor preparation is successful, otherwise @c false
+   */
+  bool prepare(void) override
+  {
+    _interp->UseNNAPI(false);
+
+    if (kTfLiteOk != _interp->AllocateTensors())
+    {
+      return false;
+    }
+
+    return true;
+  }
+
+  /**
+   * @brief Run the Invoke function of the TfLite interpreter
+   * @return @c true if Invoke() is successful, otherwise @c false
+   */
+  bool run(void) override
+  {
+    // Return true if Invoke returns kTfLiteOk
+    return kTfLiteOk == _interp->Invoke();
+  }
+
+  /**
+   * @brief Tear down the TfLite interpreter session
+   * @return @c true always
+   */
+  bool teardown(void) override
+  {
+    // Do NOTHING currently
+    return true;
+  }
+
+private:
+  ::tflite::Interpreter *const _interp;
+};
+
+} // namespace tflite
+} // namespace nnfw
+
+#endif // __NNFW_TFLITE_INTERPRETER_SESSION_H__
diff --git a/libs/tflite/include/tflite/NNAPISession.h b/libs/tflite/include/tflite/NNAPISession.h
new file mode 100644
index 000000000..b2a999d10
--- /dev/null
+++ b/libs/tflite/include/tflite/NNAPISession.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file NNAPISession.h
+ * @brief This file contains the NNAPISession class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_TFLITE_NNAPI_SESSION_H__
+#define __NNFW_TFLITE_NNAPI_SESSION_H__
+
+#include "Session.h"
+#include "tflite/ext/nnapi_delegate.h"
+
+namespace nnfw
+{
+namespace tflite
+{
+
+/**
+ * @brief Class to define an NNAPI interpreter session which is inherited from the Session class
+ */
+class NNAPISession final : public Session
+{
+public:
+  /**
+   * @brief Construct an NNAPISession object with a TfLite interpreter
+   * @param[in] interp The TfLite interpreter pointer
+   * @note Invokes BuildGraph() of the NNAPI delegate with the interpreter
+   */
+  NNAPISession(::tflite::Interpreter *interp) : _interp{interp}
+  {
+    // Construct Graph from Interpreter
+    _delegate.BuildGraph(_interp);
+  }
+
+public:
+  /**
+   * @brief Get the TfLite interpreter pointer
+   * @return The TfLite interpreter
+   */
+  ::tflite::Interpreter *interp(void) override { return _interp; }
+
+public:
+  /**
+   * @brief Prepare the TfLite interpreter session
+   * @return @c true if tensor preparation is successful, otherwise @c false
+   */
+  bool prepare(void) override
+  {
+    // Explicitly turn off T/F lite internal NNAPI delegation in order to use locally defined
+    // NNAPI delegation.
+    _interp->UseNNAPI(false);
+
+    if (kTfLiteOk != _interp->AllocateTensors())
+    {
+      return false;
+    }
+
+    return true;
+  }
+
+  /**
+   * @brief Run the Invoke function of the NNAPI delegate
+   * @return @c true if Invoke() is successful, otherwise @c false
+   */
+  bool run(void) override { return kTfLiteOk == _delegate.Invoke(_interp); }
+
+  /**
+   * @brief Tear down the TfLite interpreter session
+   * @return @c true always
+   */
+  bool teardown(void) override
+  {
+    // DO NOTHING
+    return true;
+  }
+
+private:
+  ::tflite::Interpreter *const _interp;
+  nnfw::tflite::NNAPIDelegate _delegate;
+};
+
+} // namespace tflite
+} // namespace nnfw
+
+#endif // __NNFW_TFLITE_NNAPI_SESSION_H__
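For illustration only (not part of the patch): the two Session implementations above share the same lifecycle, so a caller can switch between pure TfLite and NNAPI execution through the common interface; the function name is hypothetical.

#include "tflite/InterpreterSession.h"
#include "tflite/NNAPISession.h"

#include <memory>

bool run_once(::tflite::Interpreter &interp, bool use_nnapi)
{
  std::shared_ptr<nnfw::tflite::Session> sess;

  if (use_nnapi)
    sess = std::make_shared<nnfw::tflite::NNAPISession>(&interp);
  else
    sess = std::make_shared<nnfw::tflite::InterpreterSession>(&interp);

  // prepare() allocates tensors, run() invokes the model, teardown() releases the session
  if (!sess->prepare())
    return false;

  const bool ok = sess->run();
  sess->teardown();
  return ok;
}
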
diff --git a/libs/tflite/include/tflite/OutputIndex.h b/libs/tflite/include/tflite/OutputIndex.h
new file mode 100644
index 000000000..dd1ca8d44
--- /dev/null
+++ b/libs/tflite/include/tflite/OutputIndex.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file OutputIndex.h
+ * @brief This file contains the OutputIndex class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_TFLITE_OUTPUT_INDEX_H__
+#define __NNFW_TFLITE_OUTPUT_INDEX_H__
+
+namespace nnfw
+{
+namespace tflite
+{
+
+/**
+ * @brief Class to define OutputIndex
+ */
+class OutputIndex
+{
+public:
+  /**
+   * @brief Construct an OutputIndex object with an index value
+   * @param[in] index The value of index
+   */
+  OutputIndex(int index) : _index(index)
+  {
+    // DO NOTHING
+  }
+
+public:
+  /**
+   * @brief Get index value as int
+   * @return Index value as int
+   */
+  int asInt(void) const { return _index; }
+
+private:
+  int _index;
+};
+
+} // namespace tflite
+} // namespace nnfw
+
+#endif // __NNFW_TFLITE_OUTPUT_INDEX_H__
diff --git a/libs/tflite/include/tflite/Quantization.h b/libs/tflite/include/tflite/Quantization.h
new file mode 100644
index 000000000..4a8a0f1ac
--- /dev/null
+++ b/libs/tflite/include/tflite/Quantization.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file Quantization.h
+ * @brief This file contains the BitwiseIntToFloat union and quantization-related helpers
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_TFLITE_QUANTIZATION_H__
+#define __NNFW_TFLITE_QUANTIZATION_H__
+
+/**
+ * @brief Union to provide bitwise conversion of integer and float
+ */
+union BitwiseIntToFloat {
+  int i;
+  float f;
+};
+
+static const float FLOAT_NEAREST_TO_1 = BitwiseIntToFloat{0x3f7fffff}.f;
+
+#include "tensorflow/contrib/lite/context.h"
+
+/**
+ * @brief Get a TfLiteQuantizationParams object with default values
+ * @return TfLiteQuantizationParams object
+ */
+TfLiteQuantizationParams make_default_quantization(void);
+
+#endif // __NNFW_TFLITE_QUANTIZATION_H__
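For illustration only (not part of the patch): FLOAT_NEAREST_TO_1 is the largest float strictly below 1.0f (bit pattern 0x3f7fffff), which is convenient when clamping a value in [0, 1] before scaling it into an 8-bit range; the helper below is a hypothetical sketch, not code from this commit.

#include "tflite/Quantization.h"

#include <cstdint>

uint8_t to_uint8(float unit_value) // unit_value is assumed to lie in [0.0f, 1.0f]
{
  // Clamp below 1.0f so the scaled result stays within the uint8 range
  const float clamped = unit_value > FLOAT_NEAREST_TO_1 ? FLOAT_NEAREST_TO_1 : unit_value;
  return static_cast<uint8_t>(clamped * 256.0f);
}
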
diff --git a/libs/tflite/include/tflite/Session.h b/libs/tflite/include/tflite/Session.h
new file mode 100644
index 000000000..4f2e5c54d
--- /dev/null
+++ b/libs/tflite/include/tflite/Session.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file Session.h
+ * @brief This file contains the Session class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_TFLITE_SESSION_H__
+#define __NNFW_TFLITE_SESSION_H__
+
+#include <tensorflow/contrib/lite/interpreter.h>
+
+namespace nnfw
+{
+namespace tflite
+{
+
+/**
+ * @brief Structure to provide interface methods of an interpreter session
+ */
+struct Session
+{
+  /**
+   * @brief Destruct Session object using default destructor
+   */
+  virtual ~Session() = default;
+
+  /**
+   * @brief Get the Interpreter object pointer
+   * @return The Interpreter object pointer
+   */
+  virtual ::tflite::Interpreter *interp(void) = 0;
+
+  /**
+   * @brief Prepare the session
+   * @return @c true if the prepare method succeeded, otherwise @c false
+   */
+  virtual bool prepare(void) = 0;
+  /**
+   * @brief Run the session
+   * @return @c true if the run method succeeded, otherwise @c false
+   */
+  virtual bool run(void) = 0;
+  /**
+   * @brief Tear down (release) the session
+   * @return @c true if the teardown method succeeded, otherwise @c false
+   */
+  virtual bool teardown(void) = 0;
+};
+
+} // namespace tflite
+} // namespace nnfw
+
+#endif // __NNFW_TFLITE_SESSION_H__
diff --git a/libs/tflite/include/tflite/TensorLogger.h b/libs/tflite/include/tflite/TensorLogger.h
new file mode 100644
index 000000000..e56a76b58
--- /dev/null
+++ b/libs/tflite/include/tflite/TensorLogger.h
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +/** + * @file TensorLogger.h + * @brief This file contains TensorLogger class + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __NNFW_TFLITE_TENSOR_LOGGER_H__ +#define __NNFW_TFLITE_TENSOR_LOGGER_H__ + +#include "misc/tensor/IndexIterator.h" +#include "tflite/TensorView.h" + +#include <tensorflow/contrib/lite/interpreter.h> +#include <tensorflow/contrib/lite/context.h> +#include <fstream> +#include <iomanip> + +namespace nnfw +{ +namespace tflite +{ + +/** + * @brief Class to write input and output value / shape into a file in python form + * @note This is a utility to write input and output value / shape into a file in python form.\n + * any python app can load this value by running the python code below:\n + * exec(open(filename).read())\n + * generated python code looks like the following: \n + * tensor_shape_gen = []\n + * tensor_value_gen = []\n\n + * tensor_shape_gen.append("{2, 1, 2}")\n + * tensor_value_gen.append([1, 2, 3, 4])\n\n + * tensor_shape_gen.append("{2}")\n + * tensor_value_gen.append([1, 2])\n\n + * tensor_shape_gen.append("{2, 1, 2}")\n + * tensor_value_gen.append([1, 4, 3, 8])\n + */ +class TensorLogger +{ +private: + std::ofstream _outfile; + +public: + /** + * @brief Get TensorLogger instance + * @return The TensorLogger instance + */ + static TensorLogger &instance() + { + static TensorLogger instance; + return instance; + } + + /** + * @brief Save the tensor details to file from interpreter + * @param[in] path The file path to save + * @param[in] interp The TfLite interpreter + */ + void save(const std::string &path, ::tflite::Interpreter &interp) + { + open(path); + + int log_index = 0; + for (const auto id : interp.inputs()) + { + _outfile << "# input tensors" << std::endl; + printTensor(interp, id, log_index++); + } + for (const auto id : interp.outputs()) + { + _outfile << "# output tensors" << std::endl; + printTensor(interp, id, log_index++); + } + close(); + } + +private: + void open(const std::string &path) + { + if (!_outfile.is_open()) + _outfile.open(path, std::ios_base::out); + + _outfile << "# ------ file: " << path << " ------" << std::endl + << "tensor_shape_gen = []" << std::endl + << "tensor_value_gen = []" << std::endl + << std::endl; + } + + void printTensor(::tflite::Interpreter &interp, const int id, const int log_index) + { + const TfLiteTensor *tensor = interp.tensor(id); + + _outfile << "# tensor name: " << tensor->name << std::endl; + _outfile << "# tflite::interpreter.tensor(" << id << ") -> " + "tensor_value_gen[" + << log_index << "]" << std::endl; + + if (tensor->type == kTfLiteInt32) + { + printTensorShape(tensor); + printTensorValue<int32_t>(tensor, tensor->data.i32); + } + else if (interp.tensor(id)->type == kTfLiteUInt8) + { + printTensorShape(tensor); + printTensorValue<uint8_t>(tensor, tensor->data.uint8); + } + else if (tensor->type == kTfLiteFloat32) + { + printTensorShape(tensor); + printTensorValue<float>(tensor, tensor->data.f); + } + } + + void printTensorShape(const TfLiteTensor *tensor) + { + _outfile << "tensor_shape_gen.append('{"; + + size_t r = 0; + for (; r < tensor->dims->size - 1; r++) + { + _outfile << tensor->dims->data[r] << ", "; + } + _outfile << tensor->dims->data[r]; + + _outfile << "}')" << std::endl; + } + + template <typename T> void printTensorValue(const TfLiteTensor *tensor, T *tensor_data_ptr) + { + _outfile << "tensor_value_gen.append(["; + + _outfile << std::fixed << std::setprecision(10); + + const T *end = reinterpret_cast<const T *>(tensor->data.raw_const + tensor->bytes); + for (T *ptr = 
tensor_data_ptr; ptr < end; ptr++) + _outfile << *ptr << ", "; + + _outfile << "])" << std::endl << std::endl; + } + + void close() + { + _outfile << "# --------- tensor shape and value defined above ---------" << std::endl; + _outfile.close(); + } +}; + +} // namespace tflite +} // namespace nnfw + +#endif // __NNFW_TFLITE_TENSOR_LOGGER_H__ diff --git a/libs/tflite/include/tflite/TensorShapeUtils.h b/libs/tflite/include/tflite/TensorShapeUtils.h new file mode 100644 index 000000000..ba8687413 --- /dev/null +++ b/libs/tflite/include/tflite/TensorShapeUtils.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file TensorShapeUtils.h + * @brief This file contains utilities function of tensor shape + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __NNFW_TFLITE_TENSOR_SHAPE_UTILS_H__ +#define __NNFW_TFLITE_TENSOR_SHAPE_UTILS_H__ + +#include "misc/tensor/Shape.h" + +#include <vector> + +namespace nnfw +{ +namespace tflite +{ + +/** + * @brief Converts tensor::Shape into a vector + * @param[in] shape The tensor shape to be converted + * @return vector value of given shape object + */ +static inline std::vector<int32_t> as_dims(const nnfw::misc::tensor::Shape &shape) +{ + std::vector<int32_t> dims; + + for (uint32_t axis = 0; axis < shape.rank(); ++axis) + { + dims.emplace_back(shape.dim(axis)); + } + + return dims; +} + +/** + * @brief Broadcasts between two given shapes + * @param[in] lhs_shape The left hand side shape + * @param[in] rhs_shape The right hand side shape + * @return The broadcasted shape + */ +nnfw::misc::tensor::Shape broadcast(const nnfw::misc::tensor::Shape &lhs_shape, + const nnfw::misc::tensor::Shape &rhs_shape); + +} // namespace tflite +} // namespace nnfw + +#endif // __NNFW_TFLITE_TENSOR_SHAPE_UTILS_H__ diff --git a/libs/tflite/include/tflite/TensorUtils.h b/libs/tflite/include/tflite/TensorUtils.h new file mode 100644 index 000000000..6266c5dff --- /dev/null +++ b/libs/tflite/include/tflite/TensorUtils.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file TensorUtils.h + * @brief This file contains utilities function + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __NNFW_TFLITE_TENSOR_UTILS_H__ +#define __NNFW_TFLITE_TENSOR_UTILS_H__ + +#include <tensorflow/contrib/lite/context.h> + +namespace nnfw +{ +namespace tflite +{ + +/** + * @brief Get @c true if tensor type is kTfLiteFloat32, otherwise @c false + * @param[in] tensor The tensor object to be compared + * @return @c true if tensor type is kTfLiteFloat32, otherwise @c false + */ +inline bool isFloatTensor(const TfLiteTensor *tensor) { return tensor->type == kTfLiteFloat32; } + +/** + * @brief Get @c true if tensor is 4-D tensor and the first dimension length is 1, + * otherwise @c false + * @param[in] tensor The tensor object to be compared + * @return @c true if tensor is 4-D tensor and the first dimension length is 1, otherwise @c false + */ +inline bool isFeatureTensor(const TfLiteTensor *tensor) +{ + return (tensor->dims->size == 4) && (tensor->dims->data[0] == 1); +} + +} // namespace tflite +} // namespace nnfw + +#endif // __NNFW_TFLITE_TENSOR_UTILS_H__ diff --git a/libs/tflite/include/tflite/TensorView.h b/libs/tflite/include/tflite/TensorView.h new file mode 100644 index 000000000..79c754c78 --- /dev/null +++ b/libs/tflite/include/tflite/TensorView.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file TensorView.h + * @brief This file contains TensorView class + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __NNFW_TFLITE_TENSOR_VIEW_H__ +#define __NNFW_TFLITE_TENSOR_VIEW_H__ + +#include "tensorflow/contrib/lite/interpreter.h" + +#include "misc/tensor/Shape.h" +#include "misc/tensor/Index.h" +#include "misc/tensor/Reader.h" +#include "misc/tensor/NonIncreasingStride.h" + +namespace nnfw +{ +namespace tflite +{ + +/** + * @brief Class to define TensorView which is inherited from nnfw::misc::tensor::Reader<T> class + */ +template <typename T> class TensorView final : public nnfw::misc::tensor::Reader<T> +{ +public: + /** + * @brief Construct a TensorView object with base and shape informations + * @param[in] shape The shape of a tensor + * @param[in] base The base address of a tensor + */ + TensorView(const nnfw::misc::tensor::Shape &shape, T *base) : _shape{shape}, _base{base} + { + // Set 'stride' + _stride.init(_shape); + } + +public: + /** + * @brief Get shape of tensor + * @return Reference of shape + */ + const nnfw::misc::tensor::Shape &shape(void) const { return _shape; } + +public: + /** + * @brief Get value of tensor index + * @param[in] index The tensor index + * @return The value at the index + */ + T at(const nnfw::misc::tensor::Index &index) const override + { + const auto offset = _stride.offset(index); + return *(_base + offset); + } + +public: + /** + * @brief Get reference value of tensor index + * @param[in] index The tensor index + * @return The reference value at the index + */ + T &at(const nnfw::misc::tensor::Index &index) + { + const auto offset = _stride.offset(index); + return *(_base + offset); + } + +private: + nnfw::misc::tensor::Shape _shape; /**< The tensor shape */ + +public: + T *_base; /**< The base address of tensor */ + nnfw::misc::tensor::NonIncreasingStride _stride; /**< The NonIncreasingStride object */ + +public: + // TODO Introduce Operand ID class + /** + * @brief Create TensorView object using given parameters + * @param[in] interp The TfLite interpreter + * @param[in] tensor_index The tensor index + * @return The new TensorView<T> object + */ + static TensorView<T> make(::tflite::Interpreter &interp, int tensor_index) + { + auto tensor_ptr = interp.tensor(tensor_index); + + // Set 'shape' + nnfw::misc::tensor::Shape shape(tensor_ptr->dims->size); + + for (uint32_t axis = 0; axis < shape.rank(); ++axis) + { + shape.dim(axis) = tensor_ptr->dims->data[axis]; + } + + return TensorView<T>(shape, interp.typed_tensor<T>(tensor_index)); + } +}; + +} // namespace tflite +} // namespace nnfw + +#endif // __NNFW_TFLITE_TENSOR_VIEW_H__ diff --git a/libs/support/nnapi/src/feature/Utils.cpp b/libs/tflite/include/tflite/ext/kernels/Abs.h index 62939ff4a..74e4aa658 100644 --- a/libs/support/nnapi/src/feature/Utils.cpp +++ b/libs/tflite/include/tflite/ext/kernels/Abs.h @@ -14,30 +14,28 @@ * limitations under the License. 
*/ -#include "support/nnapi/feature/Utils.h" +#ifndef __NNFW_TFLITE_EXT_KERNELS_ABS_H__ +#define __NNFW_TFLITE_EXT_KERNELS_ABS_H__ + +#include "tensorflow/contrib/lite/context.h" namespace nnfw { -namespace support -{ -namespace nnapi +namespace tflite { -namespace feature +namespace custom { - -uint32_t indexOf(const nnfw::util::feature::Shape &shape, uint32_t ch, uint32_t row, uint32_t col) +namespace Abs { - uint32_t res = 0; - // NNAPI assumes that NHWC ordering for feature map - res += row * shape.W * shape.C; - res += col * shape.C; - res += ch; +void *InitAbs(TfLiteContext *context, const char *buffer, size_t length); +void FreeAbs(TfLiteContext *context, void *buffer); +TfLiteStatus PrepareAbs(TfLiteContext *context, TfLiteNode *node); +TfLiteStatus EvalAbs(TfLiteContext *context, TfLiteNode *node); - return res; -} - -} // namespace feature -} // namespace nnapi -} // namespace support +} // namespace Abs +} // namespace custom +} // namespace tflite } // namespace nnfw + +#endif // __NNFW_TFLITE_EXT_KERNELS_ABS_H__ diff --git a/libs/tflite/include/tflite/ext/kernels/CustomOps.h b/libs/tflite/include/tflite/ext/kernels/CustomOps.h new file mode 100644 index 000000000..3f9459bb2 --- /dev/null +++ b/libs/tflite/include/tflite/ext/kernels/CustomOps.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CustomOps.h + * @brief This file contains registration of custom operands + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __NNFW_TFLITE_EXT_KERNELS_CUSTOM_OP_H__ +#define __NNFW_TFLITE_EXT_KERNELS_CUSTOM_OP_H__ + +#include "tensorflow/contrib/lite/context.h" +#include "tflite/ext/kernels/TensorFlowMax.h" +#include "tflite/ext/kernels/SquaredDifference.h" +#include "tflite/ext/kernels/TensorFlowSum.h" +#include "tflite/ext/kernels/Abs.h" + +namespace nnfw +{ +namespace tflite +{ +namespace custom +{ + +#define REGISTER_FUNCTION(Name) \ + TfLiteRegistration *Register_##Name(void) \ + { \ + static TfLiteRegistration r = { \ + Name::Init##Name, Name::Free##Name, Name::Prepare##Name, Name::Eval##Name, \ + }; \ + r.custom_name = #Name; \ + return &r; \ + } + +REGISTER_FUNCTION(TensorFlowMax) +REGISTER_FUNCTION(SquaredDifference) +REGISTER_FUNCTION(TensorFlowSum) +REGISTER_FUNCTION(Abs) + +#undef REGISTER_FUNCTION + +} // namespace custom +} // namespace tflite +} // namespace nnfw + +#endif // __NNFW_TFLITE_EXT_KERNELS_CUSTOM_OP_H__ diff --git a/libs/tflite/include/tflite/ext/kernels/SquaredDifference.h b/libs/tflite/include/tflite/ext/kernels/SquaredDifference.h new file mode 100644 index 000000000..492523c02 --- /dev/null +++ b/libs/tflite/include/tflite/ext/kernels/SquaredDifference.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file SquaredDifference.h
+ * @brief This file contains the SquaredDifference namespace and SquaredDifference function
+ *        definitions
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_TFLITE_EXT_KERNELS_SQUARED_DIFFERENCE_H__
+#define __NNFW_TFLITE_EXT_KERNELS_SQUARED_DIFFERENCE_H__
+
+#include "tensorflow/contrib/lite/context.h"
+
+namespace nnfw
+{
+namespace tflite
+{
+namespace custom
+{
+namespace SquaredDifference
+{
+
+/**
+ * @brief Initialize the SquaredDifference operand using the contents of buffer
+ * @param[in] context The TfLite context
+ * @param[in] buffer The buffer with contents
+ * @param[in] length The buffer length
+ * @return The void pointer for user data
+ */
+void *InitSquaredDifference(TfLiteContext *context, const char *buffer, size_t length);
+
+/**
+ * @brief Release any memory it might have allocated via 'InitSquaredDifference'
+ * @param[in] context The TfLite context
+ * @param[in] buffer The buffer with contents
+ * @return N/A
+ */
+void FreeSquaredDifference(TfLiteContext *context, void *buffer);
+
+/**
+ * @brief Prepare the SquaredDifference operand for execution
+ * @param[in] context The TfLite context
+ * @param[in] node The operand node
+ * @return The TfLite status
+ */
+TfLiteStatus PrepareSquaredDifference(TfLiteContext *context, TfLiteNode *node);
+
+/**
+ * @brief Evaluate the SquaredDifference operand for execution
+ * @param[in] context The TfLite context
+ * @param[in] node The operand node
+ * @return The TfLite status
+ */
+TfLiteStatus EvalSquaredDifference(TfLiteContext *context, TfLiteNode *node);
+
+} // namespace SquaredDifference
+} // namespace custom
+} // namespace tflite
+} // namespace nnfw
+
+#endif // __NNFW_TFLITE_EXT_KERNELS_SQUARED_DIFFERENCE_H__
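For illustration only (not part of the patch): one way the Register_SquaredDifference() entry point generated by REGISTER_FUNCTION in CustomOps.h might be hooked into an op resolver; the function name register_custom_ops is hypothetical.

#include "tflite/ext/kernels/CustomOps.h"
#include "tflite/ext/kernels/register.h"

void register_custom_ops(::tflite::MutableOpResolver &resolver)
{
  // The registration bundles the Init/Free/Prepare/Eval entry points declared above
  resolver.AddCustom("SquaredDifference", nnfw::tflite::custom::Register_SquaredDifference());
}
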
diff --git a/libs/tflite/include/tflite/ext/kernels/TensorFlowMax.h b/libs/tflite/include/tflite/ext/kernels/TensorFlowMax.h
new file mode 100644
index 000000000..d31d76483
--- /dev/null
+++ b/libs/tflite/include/tflite/ext/kernels/TensorFlowMax.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file TensorFlowMax.h
+ * @brief This file contains the TensorFlowMax namespace and TensorFlowMax function definitions
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_TFLITE_EXT_KERNELS_TENSORFLOW_MAX_H__
+#define __NNFW_TFLITE_EXT_KERNELS_TENSORFLOW_MAX_H__
+
+#include "tensorflow/contrib/lite/context.h"
+
+namespace nnfw
+{
+namespace tflite
+{
+namespace custom
+{
+namespace TensorFlowMax
+{
+
+/**
+ * @brief Initialize the TensorFlowMax operand using the contents of buffer
+ * @param[in] context The TfLite context
+ * @param[in] buffer The buffer with contents
+ * @param[in] length The buffer length
+ * @return The void pointer for user data
+ */
+void *InitTensorFlowMax(TfLiteContext *context, const char *buffer, size_t length);
+
+/**
+ * @brief Release any memory it might have allocated via 'InitTensorFlowMax'
+ * @param[in] context The TfLite context
+ * @param[in] buffer The buffer with contents
+ * @return N/A
+ */
+void FreeTensorFlowMax(TfLiteContext *context, void *buffer);
+
+/**
+ * @brief Prepare the TensorFlowMax operand for execution
+ * @param[in] context The TfLite context
+ * @param[in] node The operand node
+ * @return The TfLite status
+ */
+TfLiteStatus PrepareTensorFlowMax(TfLiteContext *context, TfLiteNode *node);
+
+/**
+ * @brief Evaluate the TensorFlowMax operand for execution
+ * @param[in] context The TfLite context
+ * @param[in] node The operand node
+ * @return The TfLite status
+ */
+TfLiteStatus EvalTensorFlowMax(TfLiteContext *context, TfLiteNode *node);
+
+} // namespace TensorFlowMax
+} // namespace custom
+} // namespace tflite
+} // namespace nnfw
+
+#endif // __NNFW_TFLITE_EXT_KERNELS_TENSORFLOW_MAX_H__
diff --git a/libs/tflite/include/tflite/ext/kernels/TensorFlowSum.h b/libs/tflite/include/tflite/ext/kernels/TensorFlowSum.h
new file mode 100644
index 000000000..66783cf41
--- /dev/null
+++ b/libs/tflite/include/tflite/ext/kernels/TensorFlowSum.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#ifndef __NNFW_TFLITE_EXT_KERNELS_TENSORFLOW_SUM_H__ +#define __NNFW_TFLITE_EXT_KERNELS_TENSORFLOW_SUM_H__ + +#include "tensorflow/contrib/lite/context.h" + +namespace nnfw +{ +namespace tflite +{ +namespace custom +{ +namespace TensorFlowSum +{ + +void *InitTensorFlowSum(TfLiteContext *context, const char *buffer, size_t length); +void FreeTensorFlowSum(TfLiteContext *context, void *buffer); +TfLiteStatus PrepareTensorFlowSum(TfLiteContext *context, TfLiteNode *node); +TfLiteStatus EvalTensorFlowSum(TfLiteContext *context, TfLiteNode *node); + +} // namespace TensorFlowSum +} // namespace custom +} // namespace tflite +} // namespace nnfw + +#endif // __NNFW_TFLITE_EXT_KERNELS_TENSORFLOW_SUM_H__ diff --git a/libs/tflite/include/tflite/ext/kernels/register.h b/libs/tflite/include/tflite/ext/kernels/register.h new file mode 100644 index 000000000..124af7abc --- /dev/null +++ b/libs/tflite/include/tflite/ext/kernels/register.h @@ -0,0 +1,42 @@ +/* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// NOTE To minimize diff with upstream tensorflow, disable clang-format +// clang-format off + +// NOTE This header is derived from the following file (in TensorFlow) +// 'externals/tensorflow/tensorflow/contrib/lite/kernels/register.h' +#ifndef __NNFW_TFLITE_EXT_KERNELS_REGISTER_H__ +#define __NNFW_TFLITE_EXT_KERNELS_REGISTER_H__ + +#include <unordered_map> +#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/model.h" + +namespace nnfw { +namespace tflite { + +class BuiltinOpResolver : public ::tflite::MutableOpResolver { + public: + BuiltinOpResolver(); +}; + +} // namespace tflite +} // namespace nnfw + +#endif // __NNFW_TFLITE_EXT_KERNELS_REGISTER_H__ + +// clang-format on diff --git a/libs/tflite/include/tflite/ext/nnapi_delegate.h b/libs/tflite/include/tflite/ext/nnapi_delegate.h new file mode 100644 index 000000000..3aac01af7 --- /dev/null +++ b/libs/tflite/include/tflite/ext/nnapi_delegate.h @@ -0,0 +1,97 @@ +/* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +// NOTE To minimize diff with upstream tensorflow, disable clang-format +// clang-format off + +// NOTE This header is derived from the following file (in TensorFlow v1.12) +// 'externals/tensorflow/tensorflow/contrib/lite/nnapi_delegate.h' +#ifndef __NNFW_TFLITE_EXT_NNAPI_DELEGATE_H__ +#define __NNFW_TFLITE_EXT_NNAPI_DELEGATE_H__ + +#include "tensorflow/contrib/lite/allocation.h" +#ifdef OBS_BUILD +#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/error_reporter.h" +#else +#include "tensorflow/contrib/lite/c/c_api_internal.h" +#include "tensorflow/contrib/lite/core/api/error_reporter.h" +#endif +#include "tensorflow/contrib/lite/interpreter.h" +#include "NeuralNetworksShim.h" + +class ANeuralNetworksModel; +class ANeuralNetworksMemory; +class ANeuralNetworksCompilation; + +namespace nnfw { +namespace tflite { + +class NNAPIAllocation : public ::tflite::MMAPAllocation { + public: + NNAPIAllocation(const char* filename, ::tflite::ErrorReporter* error_reporter); + ~NNAPIAllocation(); + + size_t offset(const void* ptr) const { + auto signed_offset = reinterpret_cast<const uint8_t*>(ptr) - + reinterpret_cast<const uint8_t*>(mmapped_buffer_); + + return static_cast<size_t>(signed_offset); + } + + ANeuralNetworksMemory* memory() const { return handle_; } + bool valid() const override { return handle_ != nullptr; } + + private: + mutable ANeuralNetworksMemory* handle_ = nullptr; +}; + +class NNAPIDelegate { + public: + ~NNAPIDelegate(); + + // Convert a tflite graph to NNAPI + TfLiteStatus BuildGraph(::tflite::Interpreter* interpreter); + + // Run + TfLiteStatus Invoke(::tflite::Interpreter* interpreter); + + // Whether the current platform supports NNAPI delegation. + static bool IsSupported(); + + private: + // The NN API model handle + ANeuralNetworksModel* nn_model_ = nullptr; + // The NN API compilation handle + ANeuralNetworksCompilation* nn_compiled_model_ = nullptr; + // Model status + TfLiteStatus model_status_ = kTfLiteOk; + + // List of state tensors for LSTM, RNN, SVDF. + // NN API does not allow ops to maintain states across multiple + // invocations. We need to manually create state input tensors from + // corresponding state output tensors of TFLite operations, and map them + // correctly. + std::vector<int> model_states_inputs_; // holds NNAPI operand ids + std::vector<int> model_states_outputs_; // holds TFLite tensor ids +}; + +} // namespace tflite +} // namespace nnfw + +#endif // __NNFW_TFLITE_EXT_NNAPI_DELEGATE_H__ + +// clang-format on diff --git a/libs/tflite/include/tflite/interp/Builder.h b/libs/tflite/include/tflite/interp/Builder.h new file mode 100644 index 000000000..b4d082419 --- /dev/null +++ b/libs/tflite/include/tflite/interp/Builder.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file Builder.h + * @brief This file contains Builder structure + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __NNFW_TFLITE_INTERP_BUILDER_H__ +#define __NNFW_TFLITE_INTERP_BUILDER_H__ + +#include <tensorflow/contrib/lite/interpreter.h> + +namespace nnfw +{ +namespace tflite +{ + +/** + * @brief Structure to Builder + */ +struct Builder +{ + /** + * @brief Destroy the Builder object + */ + virtual ~Builder() = default; + + /** + * @brief Build a FlatBuffer model + * @return The TfLite interpreter object + */ + virtual std::unique_ptr<::tflite::Interpreter> build(void) const = 0; +}; + +} // namespace tflite +} // namespace nnfw + +#endif // __NNFW_TFLITE_INTERP_BUILDER_H__ diff --git a/libs/tflite/include/tflite/interp/FlatBufferBuilder.h b/libs/tflite/include/tflite/interp/FlatBufferBuilder.h new file mode 100644 index 000000000..13470b8c5 --- /dev/null +++ b/libs/tflite/include/tflite/interp/FlatBufferBuilder.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file FlatBufferBuilder.h + * @brief This file contains FlatBufferBuilder class + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __NNFW_TFLITE_INTERP_FLAT_BUFFER_BUILDER_H__ +#define __NNFW_TFLITE_INTERP_FLAT_BUFFER_BUILDER_H__ + +#include <tensorflow/contrib/lite/model.h> + +#include "tflite/interp/Builder.h" + +namespace nnfw +{ +namespace tflite +{ + +/** + * @brief Class to define FlatBufferBuilder which is inherited from Builder + */ +class FlatBufferBuilder final : public Builder +{ +public: + /** + * @brief Construct a FlatBufferBuilder object with FlatBufferModel of TfLite + * @param[in] model The TfLite Flatbuffer model + */ + FlatBufferBuilder(const ::tflite::FlatBufferModel &model) : _model{model} + { + // DO NOTHING + } + +public: + /** + * @brief Build a FlatBuffer model + * @return The TfLite interpreter pointer address + */ + std::unique_ptr<::tflite::Interpreter> build(void) const override; + +private: + const ::tflite::FlatBufferModel &_model; +}; + +} // namespace tflite +} // namespace nnfw + +#endif // __NNFW_TFLITE_INTERP_FLAT_BUFFER_BUILDER_H__ diff --git a/libs/tflite/include/tflite/interp/FunctionBuilder.h b/libs/tflite/include/tflite/interp/FunctionBuilder.h new file mode 100644 index 000000000..064375939 --- /dev/null +++ b/libs/tflite/include/tflite/interp/FunctionBuilder.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file FunctionBuilder.h + * @brief This file contains FunctionBuilder class + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __NNFW_TFLITE_INTERP_FUNCTION_BUILDER_H__ +#define __NNFW_TFLITE_INTERP_FUNCTION_BUILDER_H__ + +#include <tensorflow/contrib/lite/model.h> + +#include "tflite/interp/Builder.h" + +namespace nnfw +{ +namespace tflite +{ + +/** + * @brief Class to define FunctionBuilder which is inherited from Builder + */ +class FunctionBuilder final : public Builder +{ +public: + using SetupFunc = std::function<void(::tflite::Interpreter &)>; + +public: + /** + * @brief Construct a FunctionBuilder object with SetupFunction + * @param[in] fn The SetupFunc object + */ + FunctionBuilder(const SetupFunc &fn) : _fn{fn} + { + // DO NOTHING + } + +public: + /** + * @brief Build a SetupFunc + * @return The TfLite interpreter pointer address + */ + std::unique_ptr<::tflite::Interpreter> build(void) const override; + +private: + SetupFunc _fn; +}; + +} // namespace tflite +} // namespace nnfw + +#endif // __NNFW_TFLITE_INTERP_FUNCTION_BUILDER_H__ diff --git a/libs/support/tflite/src/Diff.cpp b/libs/tflite/src/Diff.cpp index e875571cb..45ef06110 100644 --- a/libs/support/tflite/src/Diff.cpp +++ b/libs/tflite/src/Diff.cpp @@ -14,22 +14,22 @@ * limitations under the License. */ -#include "support/tflite/Diff.h" -#include "support/tflite/nnapi_delegate.h" +#include "tflite/Diff.h" +#include "tflite/ext/nnapi_delegate.h" -#include "util/fp32.h" +#include "misc/fp32.h" -#include "util/tensor/IndexIterator.h" -#include "util/tensor/IndexFormatter.h" -#include "util/tensor/Zipper.h" -#include "util/tensor/Comparator.h" +#include "misc/tensor/IndexIterator.h" +#include "misc/tensor/IndexFormatter.h" +#include "misc/tensor/Zipper.h" +#include "misc/tensor/Comparator.h" -#include "util/environment.h" +#include "misc/environment.h" #include <iostream> #include <cassert> -class DiffSummary : public nnfw::util::tensor::Comparator::Observer +class DiffSummary : public nnfw::misc::tensor::Comparator::Observer { public: DiffSummary() @@ -41,21 +41,21 @@ public: } public: - void notify(const nnfw::util::tensor::Index &index, float expected, float obtained) override; + void notify(const nnfw::misc::tensor::Index &index, float expected, float obtained) override; public: - nnfw::util::tensor::Index max_abs_diff_index; + nnfw::misc::tensor::Index max_abs_diff_index; float max_abs_diff_expected; float max_abs_diff_obtained; float max_abs_diff_value; - nnfw::util::tensor::Index max_rel_diff_index; + nnfw::misc::tensor::Index max_rel_diff_index; float max_rel_diff_expected; float max_rel_diff_obtained; float max_rel_diff_value; }; -void DiffSummary::notify(const nnfw::util::tensor::Index &index, float expected, float obtained) +void DiffSummary::notify(const nnfw::misc::tensor::Index &index, float expected, float obtained) { const auto abs_diff_value = std::fabs(expected - obtained); @@ -67,7 +67,7 @@ void DiffSummary::notify(const nnfw::util::tensor::Index &index, float expected, max_abs_diff_obtained = obtained; } - const auto rel_diff_value = nnfw::util::fp32::relative_diff(expected, obtained); + const auto rel_diff_value = nnfw::misc::fp32::relative_diff(expected, obtained); if (max_rel_diff_value < rel_diff_value) { @@ -79,15 +79,15 @@ void DiffSummary::notify(const nnfw::util::tensor::Index &index, float expected, } template <typename T> -bool TfLiteInterpMatchApp::compareSingleTensorView( - 
const nnfw::support::tflite::TensorView<T> &expected, - const nnfw::support::tflite::TensorView<T> &obtained, int id) const +bool TfLiteInterpMatchApp::compareSingleTensorView(const nnfw::tflite::TensorView<T> &expected, + const nnfw::tflite::TensorView<T> &obtained, + int id) const { - std::vector<nnfw::util::tensor::Diff<T>> diffs; + std::vector<nnfw::misc::tensor::Diff<T>> diffs; assert(expected.shape() == obtained.shape()); - using nnfw::util::tensor::zip; - using nnfw::util::tensor::Index; + using nnfw::misc::tensor::zip; + using nnfw::misc::tensor::Index; zip(expected.shape(), expected, obtained) << [&](const Index &index, T expected_value, T obtained_value) { @@ -113,7 +113,7 @@ bool TfLiteInterpMatchApp::compareSingleTensorView( std::cout << " ---- Details ---" << std::endl; for (const auto &diff : diffs) { - std::cout << " Diff at [" << nnfw::util::tensor::IndexFormatter(diff.index) << "]" + std::cout << " Diff at [" << nnfw::misc::tensor::IndexFormatter(diff.index) << "]" << std::endl; std::cout << " expected: " << diff.expected << std::endl; std::cout << " obtained: " << diff.obtained << std::endl; @@ -125,8 +125,8 @@ bool TfLiteInterpMatchApp::compareSingleTensorView( template <> bool TfLiteInterpMatchApp::compareSingleTensorView<float>( - const nnfw::support::tflite::TensorView<float> &expected, - const nnfw::support::tflite::TensorView<float> &obtained, int id) const + const nnfw::tflite::TensorView<float> &expected, + const nnfw::tflite::TensorView<float> &obtained, int id) const { DiffSummary summary; @@ -148,7 +148,7 @@ bool TfLiteInterpMatchApp::compareSingleTensorView<float>( if (summary.max_abs_diff_value > 0) { std::cout << " Max absolute diff at [" - << nnfw::util::tensor::IndexFormatter(summary.max_abs_diff_index) << "]" << std::endl; + << nnfw::misc::tensor::IndexFormatter(summary.max_abs_diff_index) << "]" << std::endl; std::cout << " expected: " << summary.max_abs_diff_expected << std::endl; std::cout << " obtained: " << summary.max_abs_diff_obtained << std::endl; std::cout << " absolute diff: " << summary.max_abs_diff_value << std::endl; @@ -159,7 +159,7 @@ bool TfLiteInterpMatchApp::compareSingleTensorView<float>( const auto tolerance_level = summary.max_rel_diff_value / FLT_EPSILON; std::cout << " Max relative diff at [" - << nnfw::util::tensor::IndexFormatter(summary.max_rel_diff_index) << "]" << std::endl; + << nnfw::misc::tensor::IndexFormatter(summary.max_rel_diff_index) << "]" << std::endl; std::cout << " expected: " << summary.max_rel_diff_expected << std::endl; std::cout << " obtained: " << summary.max_rel_diff_obtained << std::endl; std::cout << " relative diff: " << summary.max_rel_diff_value << std::endl; @@ -174,10 +174,10 @@ bool TfLiteInterpMatchApp::compareSingleTensorView<float>( for (const auto &diff : diffs) { const auto absolute_diff = std::fabs(diff.expected - diff.obtained); - const auto relative_diff = nnfw::util::fp32::relative_diff(diff.expected, diff.obtained); + const auto relative_diff = nnfw::misc::fp32::relative_diff(diff.expected, diff.obtained); const auto tolerance_level = relative_diff / FLT_EPSILON; - std::cout << " Diff at [" << nnfw::util::tensor::IndexFormatter(diff.index) << "]" + std::cout << " Diff at [" << nnfw::misc::tensor::IndexFormatter(diff.index) << "]" << std::endl; std::cout << " expected: " << diff.expected << std::endl; std::cout << " obtained: " << diff.obtained << std::endl; @@ -206,24 +206,32 @@ bool TfLiteInterpMatchApp::run(::tflite::Interpreter &interp, ::tflite::Interpre comparators[kTfLiteUInt8] = 
[this](int id, ::tflite::Interpreter &interp, ::tflite::Interpreter &nnapi) { - const auto expected = nnfw::support::tflite::TensorView<uint8_t>::make(interp, id); - const auto obtained = nnfw::support::tflite::TensorView<uint8_t>::make(nnapi, id); + const auto expected = nnfw::tflite::TensorView<uint8_t>::make(interp, id); + const auto obtained = nnfw::tflite::TensorView<uint8_t>::make(nnapi, id); return compareSingleTensorView(expected, obtained, id); }; comparators[kTfLiteInt32] = [this](int id, ::tflite::Interpreter &interp, ::tflite::Interpreter &nnapi) { - const auto expected = nnfw::support::tflite::TensorView<int32_t>::make(interp, id); - const auto obtained = nnfw::support::tflite::TensorView<int32_t>::make(nnapi, id); + const auto expected = nnfw::tflite::TensorView<int32_t>::make(interp, id); + const auto obtained = nnfw::tflite::TensorView<int32_t>::make(nnapi, id); return compareSingleTensorView(expected, obtained, id); }; comparators[kTfLiteFloat32] = [this](int id, ::tflite::Interpreter &interp, ::tflite::Interpreter &nnapi) { - const auto expected = nnfw::support::tflite::TensorView<float>::make(interp, id); - const auto obtained = nnfw::support::tflite::TensorView<float>::make(nnapi, id); + const auto expected = nnfw::tflite::TensorView<float>::make(interp, id); + const auto obtained = nnfw::tflite::TensorView<float>::make(nnapi, id); + + return compareSingleTensorView(expected, obtained, id); + }; + + comparators[kTfLiteBool] = [this](int id, ::tflite::Interpreter &interp, + ::tflite::Interpreter &nnapi) { + const auto expected = nnfw::tflite::TensorView<bool>::make(interp, id); + const auto obtained = nnfw::tflite::TensorView<bool>::make(nnapi, id); return compareSingleTensorView(expected, obtained, id); }; @@ -250,7 +258,7 @@ bool TfLiteInterpMatchApp::run(::tflite::Interpreter &interp, ::tflite::Interpre return all_matched; } -#include "util/tensor/Object.h" +#include "misc/tensor/Object.h" using namespace std::placeholders; @@ -265,11 +273,11 @@ template <> uint8_t RandomGenerator::generate<uint8_t>(void) return static_cast<uint8_t>((_dist(_rand) - min_range) * type_range / (max_range - min_range)); } -#include "support/tflite/TensorLogger.h" +#include "tflite/TensorLogger.h" // // Random Test Runner // -int RandomTestRunner::run(const nnfw::support::tflite::interp::Builder &builder) +int RandomTestRunner::run(const nnfw::tflite::Builder &builder) { auto tfl_interp = builder.build(); auto nnapi = builder.build(); @@ -293,15 +301,15 @@ int RandomTestRunner::run(const nnfw::support::tflite::interp::Builder &builder) assert(tfl_interp->tensor(id)->type == kTfLiteInt32); assert(nnapi->tensor(id)->type == kTfLiteInt32); - auto tfl_interp_view = nnfw::support::tflite::TensorView<int32_t>::make(*tfl_interp, id); - auto nnapi_view = nnfw::support::tflite::TensorView<int32_t>::make(*nnapi, id); + auto tfl_interp_view = nnfw::tflite::TensorView<int32_t>::make(*tfl_interp, id); + auto nnapi_view = nnfw::tflite::TensorView<int32_t>::make(*nnapi, id); assert(tfl_interp_view.shape() == nnapi_view.shape()); int32_t value = 0; - nnfw::util::tensor::iterate(tfl_interp_view.shape()) - << [&](const nnfw::util::tensor::Index &ind) { + nnfw::misc::tensor::iterate(tfl_interp_view.shape()) + << [&](const nnfw::misc::tensor::Index &ind) { // TODO Generate random values tfl_interp_view.at(ind) = value; nnapi_view.at(ind) = value; @@ -314,15 +322,15 @@ int RandomTestRunner::run(const nnfw::support::tflite::interp::Builder &builder) assert(tfl_interp->tensor(id)->type == kTfLiteInt32); 
assert(nnapi->tensor(id)->type == kTfLiteInt32); - auto tfl_interp_view = nnfw::support::tflite::TensorView<int32_t>::make(*tfl_interp, id); - auto nnapi_view = nnfw::support::tflite::TensorView<int32_t>::make(*nnapi, id); + auto tfl_interp_view = nnfw::tflite::TensorView<int32_t>::make(*tfl_interp, id); + auto nnapi_view = nnfw::tflite::TensorView<int32_t>::make(*nnapi, id); assert(tfl_interp_view.shape() == nnapi_view.shape()); int32_t value = 0; - nnfw::util::tensor::iterate(tfl_interp_view.shape()) - << [&](const nnfw::util::tensor::Index &ind) { + nnfw::misc::tensor::iterate(tfl_interp_view.shape()) + << [&](const nnfw::misc::tensor::Index &ind) { // TODO Generate random values tfl_interp_view.at(ind) = value; nnapi_view.at(ind) = value; @@ -333,20 +341,20 @@ int RandomTestRunner::run(const nnfw::support::tflite::interp::Builder &builder) assert(tfl_interp->tensor(id)->type == kTfLiteUInt8); assert(nnapi->tensor(id)->type == kTfLiteUInt8); - auto tfl_interp_view = nnfw::support::tflite::TensorView<uint8_t>::make(*tfl_interp, id); - auto nnapi_view = nnfw::support::tflite::TensorView<uint8_t>::make(*nnapi, id); + auto tfl_interp_view = nnfw::tflite::TensorView<uint8_t>::make(*tfl_interp, id); + auto nnapi_view = nnfw::tflite::TensorView<uint8_t>::make(*nnapi, id); assert(tfl_interp_view.shape() == nnapi_view.shape()); - auto fp = static_cast<uint8_t (RandomGenerator::*)(const ::nnfw::util::tensor::Shape &, - const ::nnfw::util::tensor::Index &)>( + auto fp = static_cast<uint8_t (RandomGenerator::*)(const ::nnfw::misc::tensor::Shape &, + const ::nnfw::misc::tensor::Index &)>( &RandomGenerator::generate<uint8_t>); - const nnfw::util::tensor::Object<uint8_t> data(tfl_interp_view.shape(), + const nnfw::misc::tensor::Object<uint8_t> data(tfl_interp_view.shape(), std::bind(fp, _randgen, _1, _2)); assert(tfl_interp_view.shape() == data.shape()); - nnfw::util::tensor::iterate(tfl_interp_view.shape()) - << [&](const nnfw::util::tensor::Index &ind) { + nnfw::misc::tensor::iterate(tfl_interp_view.shape()) + << [&](const nnfw::misc::tensor::Index &ind) { const auto value = data.at(ind); tfl_interp_view.at(ind) = value; @@ -358,22 +366,22 @@ int RandomTestRunner::run(const nnfw::support::tflite::interp::Builder &builder) assert(tfl_interp->tensor(id)->type == kTfLiteUInt8); assert(nnapi->tensor(id)->type == kTfLiteUInt8); - auto tfl_interp_view = nnfw::support::tflite::TensorView<uint8_t>::make(*tfl_interp, id); - auto nnapi_view = nnfw::support::tflite::TensorView<uint8_t>::make(*nnapi, id); + auto tfl_interp_view = nnfw::tflite::TensorView<uint8_t>::make(*tfl_interp, id); + auto nnapi_view = nnfw::tflite::TensorView<uint8_t>::make(*nnapi, id); assert(tfl_interp_view.shape() == nnapi_view.shape()); - auto fp = static_cast<uint8_t (RandomGenerator::*)(const ::nnfw::util::tensor::Shape &, - const ::nnfw::util::tensor::Index &)>( + auto fp = static_cast<uint8_t (RandomGenerator::*)(const ::nnfw::misc::tensor::Shape &, + const ::nnfw::misc::tensor::Index &)>( &RandomGenerator::generate<uint8_t>); - const nnfw::util::tensor::Object<uint8_t> data(tfl_interp_view.shape(), + const nnfw::misc::tensor::Object<uint8_t> data(tfl_interp_view.shape(), std::bind(fp, _randgen, _1, _2)); assert(tfl_interp_view.shape() == data.shape()); uint8_t value = 0; - nnfw::util::tensor::iterate(tfl_interp_view.shape()) - << [&](const nnfw::util::tensor::Index &ind) { + nnfw::misc::tensor::iterate(tfl_interp_view.shape()) + << [&](const nnfw::misc::tensor::Index &ind) { tfl_interp_view.at(ind) = value; nnapi_view.at(ind) = 
value; }; @@ -383,21 +391,21 @@ int RandomTestRunner::run(const nnfw::support::tflite::interp::Builder &builder) assert(tfl_interp->tensor(id)->type == kTfLiteFloat32); assert(nnapi->tensor(id)->type == kTfLiteFloat32); - auto tfl_interp_view = nnfw::support::tflite::TensorView<float>::make(*tfl_interp, id); - auto nnapi_view = nnfw::support::tflite::TensorView<float>::make(*nnapi, id); + auto tfl_interp_view = nnfw::tflite::TensorView<float>::make(*tfl_interp, id); + auto nnapi_view = nnfw::tflite::TensorView<float>::make(*nnapi, id); assert(tfl_interp_view.shape() == nnapi_view.shape()); - auto fp = static_cast<float (RandomGenerator::*)(const ::nnfw::util::tensor::Shape &, - const ::nnfw::util::tensor::Index &)>( + auto fp = static_cast<float (RandomGenerator::*)(const ::nnfw::misc::tensor::Shape &, + const ::nnfw::misc::tensor::Index &)>( &RandomGenerator::generate<float>); - const nnfw::util::tensor::Object<float> data(tfl_interp_view.shape(), + const nnfw::misc::tensor::Object<float> data(tfl_interp_view.shape(), std::bind(fp, _randgen, _1, _2)); assert(tfl_interp_view.shape() == data.shape()); - nnfw::util::tensor::iterate(tfl_interp_view.shape()) - << [&](const nnfw::util::tensor::Index &ind) { + nnfw::misc::tensor::iterate(tfl_interp_view.shape()) + << [&](const nnfw::misc::tensor::Index &ind) { const auto value = data.at(ind); tfl_interp_view.at(ind) = value; @@ -409,23 +417,75 @@ int RandomTestRunner::run(const nnfw::support::tflite::interp::Builder &builder) assert(tfl_interp->tensor(id)->type == kTfLiteFloat32); assert(nnapi->tensor(id)->type == kTfLiteFloat32); - auto tfl_interp_view = nnfw::support::tflite::TensorView<float>::make(*tfl_interp, id); - auto nnapi_view = nnfw::support::tflite::TensorView<float>::make(*nnapi, id); + auto tfl_interp_view = nnfw::tflite::TensorView<float>::make(*tfl_interp, id); + auto nnapi_view = nnfw::tflite::TensorView<float>::make(*nnapi, id); assert(tfl_interp_view.shape() == nnapi_view.shape()); - auto fp = static_cast<float (RandomGenerator::*)(const ::nnfw::util::tensor::Shape &, - const ::nnfw::util::tensor::Index &)>( + auto fp = static_cast<float (RandomGenerator::*)(const ::nnfw::misc::tensor::Shape &, + const ::nnfw::misc::tensor::Index &)>( &RandomGenerator::generate<float>); - const nnfw::util::tensor::Object<float> data(tfl_interp_view.shape(), + const nnfw::misc::tensor::Object<float> data(tfl_interp_view.shape(), std::bind(fp, _randgen, _1, _2)); assert(tfl_interp_view.shape() == data.shape()); float value = 0; - nnfw::util::tensor::iterate(tfl_interp_view.shape()) - << [&](const nnfw::util::tensor::Index &ind) { + nnfw::misc::tensor::iterate(tfl_interp_view.shape()) + << [&](const nnfw::misc::tensor::Index &ind) { + tfl_interp_view.at(ind) = value; + nnapi_view.at(ind) = value; + }; + }; + + initializers[kTfLiteBool] = [&](int id, Interpreter *tfl_interp, Interpreter *nnapi) { + assert(tfl_interp->tensor(id)->type == kTfLiteBool); + assert(nnapi->tensor(id)->type == kTfLiteBool); + + auto tfl_interp_view = nnfw::tflite::TensorView<bool>::make(*tfl_interp, id); + auto nnapi_view = nnfw::tflite::TensorView<bool>::make(*nnapi, id); + + assert(tfl_interp_view.shape() == nnapi_view.shape()); + + auto fp = static_cast<bool (RandomGenerator::*)(const ::nnfw::misc::tensor::Shape &, + const ::nnfw::misc::tensor::Index &)>( + &RandomGenerator::generate<bool>); + const nnfw::misc::tensor::Object<bool> data(tfl_interp_view.shape(), + std::bind(fp, _randgen, _1, _2)); + + assert(tfl_interp_view.shape() == data.shape()); + + 
nnfw::misc::tensor::iterate(tfl_interp_view.shape()) + << [&](const nnfw::misc::tensor::Index &ind) { + const auto value = data.at(ind); + + tfl_interp_view.at(ind) = value; + nnapi_view.at(ind) = value; + }; + }; + + reseters[kTfLiteBool] = [&](int id, Interpreter *tfl_interp, Interpreter *nnapi) { + assert(tfl_interp->tensor(id)->type == kTfLiteBool); + assert(nnapi->tensor(id)->type == kTfLiteBool); + + auto tfl_interp_view = nnfw::tflite::TensorView<bool>::make(*tfl_interp, id); + auto nnapi_view = nnfw::tflite::TensorView<bool>::make(*nnapi, id); + + assert(tfl_interp_view.shape() == nnapi_view.shape()); + + auto fp = static_cast<bool (RandomGenerator::*)(const ::nnfw::misc::tensor::Shape &, + const ::nnfw::misc::tensor::Index &)>( + &RandomGenerator::generate<bool>); + const nnfw::misc::tensor::Object<bool> data(tfl_interp_view.shape(), + std::bind(fp, _randgen, _1, _2)); + + assert(tfl_interp_view.shape() == data.shape()); + + bool value = false; + + nnfw::misc::tensor::iterate(tfl_interp_view.shape()) + << [&](const nnfw::misc::tensor::Index &ind) { tfl_interp_view.at(ind) = value; nnapi_view.at(ind) = value; }; @@ -475,7 +535,7 @@ int RandomTestRunner::run(const nnfw::support::tflite::interp::Builder &builder) } else { - nnfw::NNAPIDelegate d; + nnfw::tflite::NNAPIDelegate d; if (d.BuildGraph(nnapi.get())) { @@ -496,15 +556,15 @@ int RandomTestRunner::run(const nnfw::support::tflite::interp::Builder &builder) auto equals = [tolerance](float lhs, float rhs) { // NOTE Hybrid approach // TODO Allow users to set tolerance for absolute_epsilon_equal - if (nnfw::util::fp32::absolute_epsilon_equal(lhs, rhs)) + if (nnfw::misc::fp32::absolute_epsilon_equal(lhs, rhs)) { return true; } - return nnfw::util::fp32::epsilon_equal(lhs, rhs, tolerance); + return nnfw::misc::fp32::epsilon_equal(lhs, rhs, tolerance); }; - nnfw::util::tensor::Comparator comparator(equals); + nnfw::misc::tensor::Comparator comparator(equals); TfLiteInterpMatchApp app(comparator); app.verbose() = _param.verbose; @@ -519,7 +579,7 @@ int RandomTestRunner::run(const nnfw::support::tflite::interp::Builder &builder) std::cout << "[NNAPI TEST] PASSED" << std::endl; if (_param.tensor_logging) - nnfw::support::tflite::TensorLogger::instance().save(_param.log_path, *tfl_interp); + nnfw::tflite::TensorLogger::instance().save(_param.log_path, *tfl_interp); return 0; } @@ -531,8 +591,8 @@ RandomTestRunner RandomTestRunner::make(int seed) param.verbose = 0; param.tolerance = 1; - nnfw::util::env::IntAccessor("VERBOSE").access(param.verbose); - nnfw::util::env::IntAccessor("TOLERANCE").access(param.tolerance); + nnfw::misc::env::IntAccessor("VERBOSE").access(param.verbose); + nnfw::misc::env::IntAccessor("TOLERANCE").access(param.tolerance); return RandomTestRunner{seed, param}; } diff --git a/libs/support/tflite/src/FeatureView.cpp b/libs/tflite/src/FeatureView.cpp index 4c7636780..fdf5a4b00 100644 --- a/libs/support/tflite/src/FeatureView.cpp +++ b/libs/tflite/src/FeatureView.cpp @@ -14,21 +14,19 @@ * limitations under the License. 
*/ -#include "support/tflite/FeatureView.h" -#include "support/tflite/TensorUtils.h" +#include "tflite/FeatureView.h" +#include "tflite/TensorUtils.h" #include <cassert> namespace nnfw { -namespace support -{ namespace tflite { -nnfw::util::feature::Shape getFeatureShape(const TfLiteTensor *tensor) +nnfw::misc::feature::Shape getFeatureShape(const TfLiteTensor *tensor) { - nnfw::util::feature::Shape shape{tensor->dims->data[3], tensor->dims->data[1], + nnfw::misc::feature::Shape shape{tensor->dims->data[3], tensor->dims->data[1], tensor->dims->data[2]}; return shape; @@ -69,5 +67,4 @@ float &FeatureView<float>::at(uint32_t ch, uint32_t row, uint32_t col) } } // namespace tflite -} // namespace support } // namespace nnfw diff --git a/libs/support/tflite/src/Quantization.cpp b/libs/tflite/src/Quantization.cpp index b23204d41..9c162c342 100644 --- a/libs/support/tflite/src/Quantization.cpp +++ b/libs/tflite/src/Quantization.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "support/tflite/Quantization.h" +#include "tflite/Quantization.h" TfLiteQuantizationParams make_default_quantization(void) { diff --git a/libs/support/tflite/src/TensorShapeUtils.cpp b/libs/tflite/src/TensorShapeUtils.cpp index 611ba920e..b5d906719 100644 --- a/libs/support/tflite/src/TensorShapeUtils.cpp +++ b/libs/tflite/src/TensorShapeUtils.cpp @@ -1,14 +1,12 @@ -#include "support/tflite/TensorShapeUtils.h" +#include "tflite/TensorShapeUtils.h" namespace nnfw { -namespace support -{ namespace tflite { -nnfw::util::tensor::Shape broadcast(const nnfw::util::tensor::Shape &lhs_shape, - const nnfw::util::tensor::Shape &rhs_shape) +nnfw::misc::tensor::Shape broadcast(const nnfw::misc::tensor::Shape &lhs_shape, + const nnfw::misc::tensor::Shape &rhs_shape) { const uint32_t lhs_rank = lhs_shape.rank(); const uint32_t rhs_rank = rhs_shape.rank(); @@ -36,7 +34,7 @@ nnfw::util::tensor::Shape broadcast(const nnfw::util::tensor::Shape &lhs_shape, rhs_normalized_dims.emplace_back(rhs_shape.dim(axis)); } - nnfw::util::tensor::Shape out_shape(out_rank); + nnfw::misc::tensor::Shape out_shape(out_rank); for (uint32_t axis = 0; axis < out_rank; ++axis) { @@ -47,5 +45,4 @@ nnfw::util::tensor::Shape broadcast(const nnfw::util::tensor::Shape &lhs_shape, } } // namespace tflite -} // namespace support } // namespace nnfw diff --git a/libs/tflite/src/TensorView.test.cpp b/libs/tflite/src/TensorView.test.cpp new file mode 100644 index 000000000..c710b3c33 --- /dev/null +++ b/libs/tflite/src/TensorView.test.cpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "tflite/TensorView.h" + +#include <cassert> + +void int_test(void) +{ + int value[6] = {1, 2, 3, 4, 5, 6}; + + const nnfw::misc::tensor::Shape shape{2, 3}; + const nnfw::tflite::TensorView<int> view{shape, value}; + + assert(view.at(nnfw::misc::tensor::Index{0, 0}) == 1); + assert(view.at(nnfw::misc::tensor::Index{0, 1}) == 2); + assert(view.at(nnfw::misc::tensor::Index{0, 2}) == 3); + assert(view.at(nnfw::misc::tensor::Index{1, 0}) == 4); + assert(view.at(nnfw::misc::tensor::Index{1, 1}) == 5); + assert(view.at(nnfw::misc::tensor::Index{1, 2}) == 6); +} + +int main(int argc, char **argv) +{ + float value[6] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + + const nnfw::misc::tensor::Shape shape{2, 3}; + const nnfw::tflite::TensorView<float> view{shape, value}; + + assert(view.at(nnfw::misc::tensor::Index{0, 0}) == 1.0f); + assert(view.at(nnfw::misc::tensor::Index{0, 1}) == 2.0f); + assert(view.at(nnfw::misc::tensor::Index{0, 2}) == 3.0f); + assert(view.at(nnfw::misc::tensor::Index{1, 0}) == 4.0f); + assert(view.at(nnfw::misc::tensor::Index{1, 1}) == 5.0f); + assert(view.at(nnfw::misc::tensor::Index{1, 2}) == 6.0f); + + int_test(); + + return 0; +} diff --git a/libs/tflite/src/ext/kernels/Abs.cpp b/libs/tflite/src/ext/kernels/Abs.cpp new file mode 100644 index 000000000..7e9c2338d --- /dev/null +++ b/libs/tflite/src/ext/kernels/Abs.cpp @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "tflite/ext/kernels/Abs.h" +#include "tensorflow/contrib/lite/kernels/kernel_util.h" + +#include <iostream> +#include <cmath> + +namespace nnfw +{ +namespace tflite +{ +namespace custom +{ +namespace Abs +{ + +void *InitAbs(TfLiteContext *context, const char *buffer, size_t length) { return nullptr; } + +void FreeAbs(TfLiteContext *context, void *buffer) {} + +TfLiteStatus PrepareAbs(TfLiteContext *context, TfLiteNode *node) +{ + TF_LITE_ENSURE_EQ(context, ::tflite::NumInputs(node), 1); + TF_LITE_ENSURE_EQ(context, ::tflite::NumOutputs(node), 1); + + const TfLiteTensor *input = ::tflite::GetInput(context, node, 0); + TfLiteTensor *output = ::tflite::GetOutput(context, node, 0); + + TF_LITE_ENSURE_EQ(context, input->type, output->type); + + return context->ResizeTensor(context, output, TfLiteIntArrayCopy(input->dims)); +} + +TfLiteStatus EvalAbs(TfLiteContext *context, TfLiteNode *node) +{ + const TfLiteTensor *input = ::tflite::GetInput(context, node, 0); + TfLiteTensor *output = ::tflite::GetOutput(context, node, 0); + size_t elements = ::tflite::NumElements(input); + switch (input->type) + { + case kTfLiteFloat32: + { + auto *in = input->data.f; + auto *in_end = in + elements; + auto *out = output->data.f; + for (; in < in_end; in++, out++) + *out = std::abs(*in); + return kTfLiteOk; + } + case kTfLiteInt32: + { + auto *in = input->data.i32; + auto *in_end = in + elements; + auto *out = output->data.i32; + for (; in < in_end; in++, out++) + *out = std::abs(*in); + return kTfLiteOk; + } + case kTfLiteInt64: + { + auto *in = input->data.i64; + auto *in_end = in + elements; + auto *out = output->data.i64; + for (; in < in_end; in++, out++) + *out = std::abs(*in); + return kTfLiteOk; + } + case kTfLiteUInt8: + { + auto *in = input->data.uint8; + auto *in_end = in + elements; + auto *out = output->data.uint8; + for (; in < in_end; in++, out++) + *out = std::abs(*in); + return kTfLiteOk; + } + default: + { + context->ReportError(context, "Input type %d is not supported", input->type); + return kTfLiteError; + } + } +} + +} // namespace Abs +} // namespace custom +} // namespace tflite +} // namespace nnfw diff --git a/libs/support/tflite/src/kernels/SquaredDifference.cpp b/libs/tflite/src/ext/kernels/SquaredDifference.cpp index 25e10a8ed..8ac2b1de0 100644 --- a/libs/support/tflite/src/kernels/SquaredDifference.cpp +++ b/libs/tflite/src/ext/kernels/SquaredDifference.cpp @@ -14,19 +14,17 @@ * limitations under the License. 
*/ -#include "support/tflite/kernels/SquaredDifference.h" +#include "tflite/ext/kernels/SquaredDifference.h" #include "tensorflow/contrib/lite/kernels/kernel_util.h" #include <iostream> -namespace tflite +namespace nnfw { -namespace ops +namespace tflite { namespace custom { -namespace nnfw -{ namespace SquaredDifference { @@ -39,12 +37,12 @@ void FreeSquaredDifference(TfLiteContext *context, void *buffer) {} TfLiteStatus PrepareSquaredDifference(TfLiteContext *context, TfLiteNode *node) { - TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); - TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + TF_LITE_ENSURE_EQ(context, ::tflite::NumInputs(node), 2); + TF_LITE_ENSURE_EQ(context, ::tflite::NumOutputs(node), 1); - const TfLiteTensor *input1 = GetInput(context, node, 0); - const TfLiteTensor *input2 = GetInput(context, node, 1); - TfLiteTensor *output = GetOutput(context, node, 0); + const TfLiteTensor *input1 = ::tflite::GetInput(context, node, 0); + const TfLiteTensor *input2 = ::tflite::GetInput(context, node, 1); + TfLiteTensor *output = ::tflite::GetOutput(context, node, 0); TF_LITE_ENSURE_EQ(context, input1->type, input2->type); TF_LITE_ENSURE_EQ(context, input1->type, output->type); @@ -55,12 +53,12 @@ TfLiteStatus PrepareSquaredDifference(TfLiteContext *context, TfLiteNode *node) TfLiteStatus EvalSquaredDifference(TfLiteContext *context, TfLiteNode *node) { - const TfLiteTensor *input1 = GetInput(context, node, 0); - const TfLiteTensor *input2 = GetInput(context, node, 1); + const TfLiteTensor *input1 = ::tflite::GetInput(context, node, 0); + const TfLiteTensor *input2 = ::tflite::GetInput(context, node, 1); - TfLiteTensor *output = GetOutput(context, node, 0); + TfLiteTensor *output = ::tflite::GetOutput(context, node, 0); - size_t elements = NumElements(input1); + size_t elements = ::tflite::NumElements(input1); switch (input1->type) { @@ -109,7 +107,6 @@ TfLiteStatus EvalSquaredDifference(TfLiteContext *context, TfLiteNode *node) } } // namespace SquaredDifference -} // nnfw } // namespace custom -} // namespace ops } // namespace tflite +} // namespace nnfw diff --git a/libs/support/tflite/src/kernels/TensorFlowMax.cpp b/libs/tflite/src/ext/kernels/TensorFlowMax.cpp index abc6fda4e..d72ad242c 100644 --- a/libs/support/tflite/src/kernels/TensorFlowMax.cpp +++ b/libs/tflite/src/ext/kernels/TensorFlowMax.cpp @@ -14,19 +14,17 @@ * limitations under the License. 
*/ -#include "support/tflite/kernels/TensorFlowMax.h" +#include "tflite/ext/kernels/TensorFlowMax.h" #include "tensorflow/contrib/lite/kernels/kernel_util.h" #include <iostream> -namespace tflite +namespace nnfw { -namespace ops +namespace tflite { namespace custom { -namespace nnfw -{ namespace TensorFlowMax { @@ -34,9 +32,9 @@ struct TensorFlowMaxOp { TensorFlowMaxOp(TfLiteContext *context, TfLiteNode *node) { - input = tflite::GetInput(context, node, 0); - axis = tflite::GetInput(context, node, 1); - output = tflite::GetOutput(context, node, 0); + input = ::tflite::GetInput(context, node, 0); + axis = ::tflite::GetInput(context, node, 1); + output = ::tflite::GetOutput(context, node, 0); } const TfLiteTensor *input; const TfLiteTensor *axis; @@ -62,16 +60,16 @@ TfLiteStatus ResizeTempAxis(TfLiteContext *context, TensorFlowMaxOp *op_context, TfLiteTensor *resolved_axis) { TfLiteIntArray *axis_size = TfLiteIntArrayCreate(1); - axis_size->data[0] = static_cast<int>(tflite::NumElements(op_context->axis)); + axis_size->data[0] = static_cast<int>(::tflite::NumElements(op_context->axis)); return context->ResizeTensor(context, resolved_axis, axis_size); } // Resizes output array based on the input size and resolved axis. TfLiteStatus ResizeOutputTensor(TfLiteContext *context, TensorFlowMaxOp *op_context) { - size_t num_axis = tflite::NumElements(op_context->axis); - const TfLiteIntArray *input_dims = op_context->input->dims; - int input_num_dims = tflite::NumDimensions(op_context->input); + size_t num_axis = ::tflite::NumElements(op_context->axis); + TfLiteIntArray *input_dims = op_context->input->dims; + int input_num_dims = ::tflite::NumDimensions(op_context->input); const int *axis = op_context->axis->data.i32; { @@ -100,26 +98,43 @@ TfLiteStatus ResizeOutputTensor(TfLiteContext *context, TensorFlowMaxOp *op_cont } } // Determines output dimensions. 
- TfLiteIntArray *output_dims = TfLiteIntArrayCreate(input_num_dims - num_reduce_axis); - int num_skip_axis = 0; - for (int idx = 0; idx < input_num_dims; ++idx) + int output_num_dims = ::tflite::NumDimensions(op_context->output); + TF_LITE_ENSURE(context, (input_num_dims == output_num_dims) || + (input_num_dims - num_reduce_axis == output_num_dims)); + + if (input_num_dims == output_num_dims) { - bool is_axis = false; + TfLiteIntArray *output_dims = TfLiteIntArrayCopy(input_dims); for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx) { - if (axis[axis_idx] == idx || axis[axis_idx] + input_num_dims == idx) - { - ++num_skip_axis; - is_axis = true; - break; - } + int current = axis[axis_idx]; + output_dims->data[current] = 1; } - if (!is_axis) + return context->ResizeTensor(context, op_context->output, output_dims); + } + else + { + TfLiteIntArray *output_dims = TfLiteIntArrayCreate(output_num_dims); + int num_skip_axis = 0; + for (int idx = 0; idx < input_num_dims; ++idx) { - output_dims->data[idx - num_skip_axis] = input_dims->data[idx]; + bool is_axis = false; + for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx) + { + if (axis[axis_idx] == idx || axis[axis_idx] + input_num_dims == idx) + { + ++num_skip_axis; + is_axis = true; + break; + } + } + if (!is_axis) + { + output_dims->data[idx - num_skip_axis] = input_dims->data[idx]; + } } + return context->ResizeTensor(context, op_context->output, output_dims); } - return context->ResizeTensor(context, op_context->output, output_dims); } } @@ -136,7 +151,7 @@ TfLiteStatus InitializeTemporaries(TfLiteContext *context, TfLiteNode *node, scratch_tensor->type = kTfLiteInt32; scratch_tensor->allocation_type = kTfLiteArenaRw; TfLiteIntArray *index_size = TfLiteIntArrayCreate(1); - index_size->data[0] = tflite::NumDimensions(op_context->input); + index_size->data[0] = ::tflite::NumDimensions(op_context->input); TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scratch_tensor, index_size)); // Creates a temp tensor to store resolved axis given input data. @@ -148,18 +163,18 @@ TfLiteStatus InitializeTemporaries(TfLiteContext *context, TfLiteNode *node, TfLiteStatus PrepareTensorFlowMax(TfLiteContext *context, TfLiteNode *node) { - TF_LITE_ENSURE_EQ(context, tflite::NumInputs(node), 2); - TF_LITE_ENSURE_EQ(context, tflite::NumOutputs(node), 1); + TF_LITE_ENSURE_EQ(context, ::tflite::NumInputs(node), 2); + TF_LITE_ENSURE_EQ(context, ::tflite::NumOutputs(node), 1); TensorFlowMaxOp op_context(context, node); TF_LITE_ENSURE_OK(context, InitializeTemporaries(context, node, &op_context)); TfLiteTensor *resolved_axis = &context->tensors[node->temporaries->data[1]]; // Leaves work to Eval if axis is not constant; else resizes output. 
- if (!tflite::IsConstantTensor(op_context.axis)) + if (!::tflite::IsConstantTensor(op_context.axis)) { - tflite::SetTensorToDynamic(op_context.output); - tflite::SetTensorToDynamic(resolved_axis); + ::tflite::SetTensorToDynamic(op_context.output); + ::tflite::SetTensorToDynamic(resolved_axis); return kTfLiteOk; } resolved_axis->allocation_type = kTfLiteArenaRw; @@ -336,11 +351,11 @@ TfLiteStatus EvalTensorFlowMax(TfLiteContext *context, TfLiteNode *node) { TensorFlowMaxOp op_context(context, node); - int num_axis = static_cast<int>(tflite::NumElements(op_context.axis)); + int num_axis = static_cast<int>(::tflite::NumElements(op_context.axis)); TfLiteTensor *temp_index = &context->tensors[node->temporaries->data[0]]; TfLiteTensor *resolved_axis = &context->tensors[node->temporaries->data[1]]; // Resize the output tensor if the output tensor is dynamic. - if (tflite::IsDynamicTensor(op_context.output)) + if (::tflite::IsDynamicTensor(op_context.output)) { TF_LITE_ENSURE_OK(context, ResizeTempAxis(context, &op_context, resolved_axis)); TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context)); @@ -383,8 +398,8 @@ TfLiteStatus EvalTensorFlowMax(TfLiteContext *context, TfLiteNode *node) return returnStatus; } + } // namespace TensorFlowMax -} // namespace nnfw } // namespace custom -} // namespace ops } // namespace tflite +} // namespace nnfw diff --git a/libs/tflite/src/ext/kernels/TensorFlowSum.cpp b/libs/tflite/src/ext/kernels/TensorFlowSum.cpp new file mode 100644 index 000000000..cbf97970c --- /dev/null +++ b/libs/tflite/src/ext/kernels/TensorFlowSum.cpp @@ -0,0 +1,400 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tflite/ext/kernels/TensorFlowSum.h" +#include "tensorflow/contrib/lite/kernels/kernel_util.h" + +#include <iostream> + +namespace nnfw +{ +namespace tflite +{ +namespace custom +{ +namespace TensorFlowSum +{ + +struct TensorFlowSumOp +{ + TensorFlowSumOp(TfLiteContext *context, TfLiteNode *node) + { + input = ::tflite::GetInput(context, node, 0); + axis = ::tflite::GetInput(context, node, 1); + output = ::tflite::GetOutput(context, node, 0); + } + const TfLiteTensor *input; + const TfLiteTensor *axis; + TfLiteTensor *output; +}; + +void *InitTensorFlowSum(TfLiteContext *context, const char *buffer, size_t length) +{ + // Creates two temp tensors to store index and axis for internal + // implementation only. + auto *scratch_tensor_index = new int; + context->AddTensors(context, 2, scratch_tensor_index); + return scratch_tensor_index; +} + +void FreeTensorFlowSum(TfLiteContext *context, void *buffer) +{ + delete static_cast<TensorFlowSumOp *>(buffer); +} + +// Resizes the temp tensor that stores resolved axis. 
+TfLiteStatus ResizeTempAxis(TfLiteContext *context, TensorFlowSumOp *op_context, + TfLiteTensor *resolved_axis) +{ + TfLiteIntArray *axis_size = TfLiteIntArrayCreate(1); + axis_size->data[0] = static_cast<int>(::tflite::NumElements(op_context->axis)); + return context->ResizeTensor(context, resolved_axis, axis_size); +} + +// Resizes output array based on the input size and resolved axis. +TfLiteStatus ResizeOutputTensor(TfLiteContext *context, TensorFlowSumOp *op_context) +{ + size_t num_axis = ::tflite::NumElements(op_context->axis); + TfLiteIntArray *input_dims = op_context->input->dims; + int input_num_dims = ::tflite::NumDimensions(op_context->input); + const int *axis = op_context->axis->data.i32; + + { + // Calculates size of reducing axis. + int num_reduce_axis = num_axis; + for (int i = 0; i < num_axis; ++i) + { + int current = axis[i]; + if (current < 0) + { + current += input_num_dims; + } + TF_LITE_ENSURE(context, current >= 0 && current < input_num_dims); + for (int j = 0; j < i; ++j) + { + int previous = axis[j]; + if (previous < 0) + { + previous += input_num_dims; + } + if (current == previous) + { + --num_reduce_axis; + break; + } + } + } + // Determines output dimensions. + int output_num_dims = ::tflite::NumDimensions(op_context->output); + TF_LITE_ENSURE(context, (input_num_dims == output_num_dims) || + (input_num_dims - num_reduce_axis == output_num_dims)); + + if (input_num_dims == output_num_dims) + { + TfLiteIntArray *output_dims = TfLiteIntArrayCopy(input_dims); + for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx) + { + int current = axis[axis_idx]; + output_dims->data[current] = 1; + } + return context->ResizeTensor(context, op_context->output, output_dims); + } + else + { + TfLiteIntArray *output_dims = TfLiteIntArrayCreate(output_num_dims); + int num_skip_axis = 0; + for (int idx = 0; idx < input_num_dims; ++idx) + { + bool is_axis = false; + for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx) + { + if (axis[axis_idx] == idx || axis[axis_idx] + input_num_dims == idx) + { + ++num_skip_axis; + is_axis = true; + break; + } + } + if (!is_axis) + { + output_dims->data[idx - num_skip_axis] = input_dims->data[idx]; + } + } + return context->ResizeTensor(context, op_context->output, output_dims); + } + } +} + +// Initializes temp tensors to store index and resolved axis. +TfLiteStatus InitializeTemporaries(TfLiteContext *context, TfLiteNode *node, + TensorFlowSumOp *op_context) +{ + // Creates a temp index to iterate through input data. + int *scratch_tensor_index = reinterpret_cast<int *>(node->user_data); + TfLiteIntArrayFree(node->temporaries); + node->temporaries = TfLiteIntArrayCreate(2); + node->temporaries->data[0] = *scratch_tensor_index; + TfLiteTensor *scratch_tensor = &context->tensors[node->temporaries->data[0]]; + scratch_tensor->type = kTfLiteInt32; + scratch_tensor->allocation_type = kTfLiteArenaRw; + TfLiteIntArray *index_size = TfLiteIntArrayCreate(1); + index_size->data[0] = ::tflite::NumDimensions(op_context->input); + TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scratch_tensor, index_size)); + + // Creates a temp tensor to store resolved axis given input data. 
+ node->temporaries->data[1] = *scratch_tensor_index + 1; + TfLiteTensor *resolved_axis = &context->tensors[node->temporaries->data[1]]; + resolved_axis->type = kTfLiteInt32; + return kTfLiteOk; +} + +TfLiteStatus PrepareTensorFlowSum(TfLiteContext *context, TfLiteNode *node) +{ + TF_LITE_ENSURE_EQ(context, ::tflite::NumInputs(node), 2); + TF_LITE_ENSURE_EQ(context, ::tflite::NumOutputs(node), 1); + + TensorFlowSumOp op_context(context, node); + TF_LITE_ENSURE_OK(context, InitializeTemporaries(context, node, &op_context)); + + TfLiteTensor *resolved_axis = &context->tensors[node->temporaries->data[1]]; + // Leaves work to Eval if axis is not constant; else resizes output. + if (!::tflite::IsConstantTensor(op_context.axis)) + { + ::tflite::SetTensorToDynamic(op_context.output); + ::tflite::SetTensorToDynamic(resolved_axis); + return kTfLiteOk; + } + resolved_axis->allocation_type = kTfLiteArenaRw; + TF_LITE_ENSURE_OK(context, ResizeTempAxis(context, &op_context, resolved_axis)); + return ResizeOutputTensor(context, &op_context); +} + +// Gets offset of index if expanded on axis. When expanded, the flattened offset +// will not change, if the output index changes on the given axis. For example, +// if you have a 2D tensor and you are expanding to 3D on axis 0, +// then index (0, 1, 2) and index (1, 1, 2) will map from the same flattened +// offset. +inline size_t ExpandedInputOffset(const int num_dims, const int *dims, const int *index, + const int num_axis, const int *axis) +{ + size_t offset = 0; + int out_idx = 0; + for (int in_idx = 0; in_idx < num_dims; ++in_idx) + { + // if we need to expand this axis + bool is_axis = false; + if (axis != nullptr) + { + for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx) + { + if (in_idx == axis[axis_idx]) + { + is_axis = true; + break; + } + } + } + if (!is_axis) + { + offset = offset * static_cast<size_t>(dims[in_idx]) + static_cast<size_t>(index[out_idx]); + out_idx++; + } + else + { + offset = offset * static_cast<size_t>(dims[in_idx]); + } + } + return offset; +} + +// Gets offset of index if reducing on axis. When reducing, the flattened offset +// will not change, if the input index changes on the given axis. For example, +// if you have a 3D tensor and you are reducing to 2D by eliminating axis 0, +// then index (0, 1, 2) and index (1, 1, 2) will map to the same flattened +// offset. +// TODO(kanlig): uses Dims to represent dimensions. +inline size_t ReducedOutputOffset(const int num_dims, const int *dims, const int *index, + const int num_axis, const int *axis) +{ + size_t offset = 0; + for (int idx = 0; idx < num_dims; ++idx) + { + // if we need to skip this axis + bool is_axis = false; + if (axis != nullptr) + { + for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx) + { + if (idx == axis[axis_idx]) + { + is_axis = true; + break; + } + } + } + if (!is_axis) + { + offset = offset * static_cast<size_t>(dims[idx]) + static_cast<size_t>(index[idx]); + } + } + return offset; +} + +// Gets next index to iterate through a multidimensional array. 
+inline bool NextIndex(TfLiteContext *context, const int num_dims, const int *dims, int *current) +{ + int carry = 1; + for (int idx = num_dims - 1; idx >= 0; --idx) + { + int current_val = current[idx] + carry; + TF_LITE_ENSURE(context, (dims[idx] >= current_val)); + if (dims[idx] == current_val) + { + current[idx] = 0; + } + else + { + current[idx] = current_val; + carry = 0; + break; + } + } + return (carry == 0); +} + +template <typename T> +inline TfLiteStatus +CustomSum(TfLiteContext *context, T *input_data, const int *input_dims, const int input_num_dims, + T *output_data, const int *output_dims, const int output_num_dims, const int *axis, + const int num_axis_dimensions, bool keep_dims, int *temp_index, int *resolved_axis) +{ + // resolves axis. + int num_resolved_axis = 0; + for (int idx = 0; idx < num_axis_dimensions; ++idx) + { + int current = axis[idx]; + TF_LITE_ENSURE(context, (current < input_num_dims && current + input_num_dims >= 0)); + if (current < 0) + { + current += input_num_dims; + } + bool is_dup = false; + for (int j = 0; j < num_resolved_axis; ++j) + { + if (resolved_axis[j] == current) + { + is_dup = true; + break; + } + } + if (!is_dup) + { + resolved_axis[num_resolved_axis++] = current; + } + } + + TF_LITE_ENSURE(context, (input_num_dims > 0)); + TF_LITE_ENSURE(context, (input_dims != nullptr)); + TF_LITE_ENSURE(context, (temp_index != nullptr)); + + // resets output data. + for (int idx = 0; idx < output_num_dims; ++idx) + { + temp_index[idx] = 0; + } + for (bool has_next = true; has_next; + has_next = NextIndex(context, output_num_dims, output_dims, temp_index)) + { + size_t output_offset = + ReducedOutputOffset(output_num_dims, output_dims, temp_index, 0, nullptr); + output_data[output_offset] = 0; + } + + // resets temp index. + for (int idx = 0; idx < input_num_dims; ++idx) + { + temp_index[idx] = 0; + } + + // iterates through input_data. + for (bool has_next = true; has_next; + has_next = NextIndex(context, input_num_dims, input_dims, temp_index)) + { + size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, temp_index, 0, nullptr); + size_t output_offset = ReducedOutputOffset(input_num_dims, input_dims, temp_index, + num_resolved_axis, resolved_axis); + output_data[output_offset] += input_data[input_offset]; + } + + return kTfLiteOk; +} + +TfLiteStatus EvalTensorFlowSum(TfLiteContext *context, TfLiteNode *node) +{ + + TensorFlowSumOp op_context(context, node); + int num_axis = static_cast<int>(::tflite::NumElements(op_context.axis)); + TfLiteTensor *temp_index = &context->tensors[node->temporaries->data[0]]; + TfLiteTensor *resolved_axis = &context->tensors[node->temporaries->data[1]]; + // Resize the output tensor if the output tensor is dynamic. 
+ if (::tflite::IsDynamicTensor(op_context.output)) + { + TF_LITE_ENSURE_OK(context, ResizeTempAxis(context, &op_context, resolved_axis)); + TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context)); + } + + TfLiteStatus returnStatus = kTfLiteOk; + switch (op_context.input->type) + { + case kTfLiteFloat32: + returnStatus = CustomSum<float>( + context, op_context.input->data.f, op_context.input->dims->data, + op_context.input->dims->size, op_context.output->data.f, op_context.output->dims->data, + op_context.output->dims->size, op_context.axis->data.i32, num_axis, false, + temp_index->data.i32, resolved_axis->data.i32); + break; + case kTfLiteInt32: + returnStatus = CustomSum<int>(context, op_context.input->data.i32, + op_context.input->dims->data, op_context.input->dims->size, + op_context.output->data.i32, op_context.output->dims->data, + op_context.output->dims->size, op_context.axis->data.i32, + num_axis, false, temp_index->data.i32, resolved_axis->data.i32); + break; + case kTfLiteUInt8: + returnStatus = CustomSum<uint8_t>( + context, op_context.input->data.uint8, op_context.input->dims->data, + op_context.input->dims->size, op_context.output->data.uint8, + op_context.output->dims->data, op_context.output->dims->size, op_context.axis->data.i32, + num_axis, false, temp_index->data.i32, resolved_axis->data.i32); + break; + case kTfLiteInt64: + returnStatus = CustomSum<int64_t>( + context, op_context.input->data.i64, op_context.input->dims->data, + op_context.input->dims->size, op_context.output->data.i64, op_context.output->dims->data, + op_context.output->dims->size, op_context.axis->data.i32, num_axis, false, + temp_index->data.i32, resolved_axis->data.i32); + break; + default: + returnStatus = kTfLiteError; + } + + return returnStatus; +} + +} // namespace TensorFlowSum +} // namespace custom +} // namespace tflite +} // namespace nnfw diff --git a/libs/support/tflite/src/kernels/register.cpp b/libs/tflite/src/ext/kernels/register.cpp index 6700b4de4..b822bd616 100644 --- a/libs/support/tflite/src/kernels/register.cpp +++ b/libs/tflite/src/ext/kernels/register.cpp @@ -14,18 +14,17 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +// NOTE To minimize diff with upstream tensorflow, disable clang-format +// clang-format off + // NOTE This code is derived from the following file (in TensorFlow) // 'externals/tensorflow/tensorflow/contrib/lite/kernels/register.cc' -#include "support/tflite/kernels/register.h" -#include "support/tflite/kernels/CustomOps.h" +#include "tflite/ext/kernels/register.h" +#include "tflite/ext/kernels/CustomOps.h" -// TODO Use namespace nnfw -namespace tflite -{ -namespace ops -{ -namespace builtin -{ +namespace tflite { +namespace ops { +namespace builtin { TfLiteRegistration *Register_RELU(); TfLiteRegistration *Register_RELU_N1_TO_1(); @@ -91,9 +90,41 @@ TfLiteRegistration *Register_SLICE(); TfLiteRegistration *Register_SIN(); TfLiteRegistration *Register_TRANSPOSE_CONV(); TfLiteRegistration *Register_SPARSE_TO_DENSE(); +#ifndef OBS_BUILD +TfLiteRegistration *Register_SUM(); +TfLiteRegistration *Register_REDUCE_MAX(); +TfLiteRegistration *Register_REDUCE_MIN(); +TfLiteRegistration *Register_EQUAL(); +TfLiteRegistration *Register_NOT_EQUAL(); +TfLiteRegistration *Register_SQRT(); +TfLiteRegistration *Register_RSQRT(); +TfLiteRegistration *Register_SHAPE(); +TfLiteRegistration *Register_POW(); +TfLiteRegistration *Register_FAKE_QUANT(); +TfLiteRegistration *Register_PACK(); +TfLiteRegistration *Register_ONE_HOT(); +TfLiteRegistration *Register_LOGICAL_OR(); +TfLiteRegistration *Register_LOGICAL_AND(); +TfLiteRegistration *Register_LOGICAL_NOT(); +TfLiteRegistration *Register_UNPACK(); +TfLiteRegistration *Register_FLOOR_DIV(); +TfLiteRegistration *Register_SQUARE(); +TfLiteRegistration *Register_ZEROS_LIKE(); +#endif // OBS_BUILD + +} // namespace builtin +} // namespace ops +} // namespace tflite + +namespace nnfw { +namespace tflite { BuiltinOpResolver::BuiltinOpResolver() { + // Using namespace directive to minimize diff with upstream tensorflow + using namespace ::tflite::ops::builtin; + using namespace ::tflite; + AddBuiltin(BuiltinOperator_RELU, Register_RELU()); AddBuiltin(BuiltinOperator_RELU_N1_TO_1, Register_RELU_N1_TO_1()); AddBuiltin(BuiltinOperator_RELU6, Register_RELU6()); @@ -156,14 +187,35 @@ BuiltinOpResolver::BuiltinOpResolver() AddBuiltin(BuiltinOperator_SELECT, Register_SELECT()); AddBuiltin(BuiltinOperator_SLICE, Register_SLICE()); AddBuiltin(BuiltinOperator_SIN, Register_SIN()); +#ifndef OBS_BUILD + AddBuiltin(BuiltinOperator_SUM, Register_SUM()); + AddBuiltin(BuiltinOperator_REDUCE_MAX, Register_REDUCE_MAX()); + AddBuiltin(BuiltinOperator_REDUCE_MIN, Register_REDUCE_MIN()); AddBuiltin(BuiltinOperator_TRANSPOSE_CONV, Register_TRANSPOSE_CONV()); AddBuiltin(BuiltinOperator_SPARSE_TO_DENSE, Register_SPARSE_TO_DENSE()); + AddBuiltin(BuiltinOperator_EQUAL, Register_EQUAL()); + AddBuiltin(BuiltinOperator_NOT_EQUAL, Register_NOT_EQUAL()); + AddBuiltin(BuiltinOperator_SQRT, Register_SQRT()); + AddBuiltin(BuiltinOperator_RSQRT, Register_RSQRT()); + AddBuiltin(BuiltinOperator_SHAPE, Register_SHAPE()); + AddBuiltin(BuiltinOperator_POW, Register_POW()); + AddBuiltin(BuiltinOperator_FAKE_QUANT, Register_FAKE_QUANT(), 1, 2); + AddBuiltin(BuiltinOperator_PACK, Register_PACK()); + AddBuiltin(BuiltinOperator_ONE_HOT, Register_ONE_HOT()); + AddBuiltin(BuiltinOperator_LOGICAL_OR, Register_LOGICAL_OR()); + AddBuiltin(BuiltinOperator_LOGICAL_AND, Register_LOGICAL_AND()); + AddBuiltin(BuiltinOperator_LOGICAL_NOT, Register_LOGICAL_NOT()); + AddBuiltin(BuiltinOperator_UNPACK, Register_UNPACK()); + 
AddBuiltin(BuiltinOperator_FLOOR_DIV, Register_FLOOR_DIV()); + AddBuiltin(BuiltinOperator_SQUARE, Register_SQUARE()); + AddBuiltin(BuiltinOperator_ZEROS_LIKE, Register_ZEROS_LIKE()); +#endif // OBS_BUILD - AddCustom("TensorFlowMax", tflite::ops::custom::nnfw::Register_TensorFlowMax()); - AddCustom("RSQRT", tflite::ops::custom::nnfw::Register_RSQRT()); - AddCustom("SquaredDifference", tflite::ops::custom::nnfw::Register_SquaredDifference()); + AddCustom("TensorFlowMax", nnfw::tflite::custom::Register_TensorFlowMax()); + AddCustom("SquaredDifference", nnfw::tflite::custom::Register_SquaredDifference()); + AddCustom("TensorFlowSum", nnfw::tflite::custom::Register_TensorFlowSum()); + AddCustom("Abs", nnfw::tflite::custom::Register_Abs()); } -} // namespace builtin -} // namespace ops -} // namespace tflite +} // namespace tflite +} // namespace nnfw diff --git a/libs/tflite/src/ext/nnapi_delegate.cpp b/libs/tflite/src/ext/nnapi_delegate.cpp new file mode 100644 index 000000000..25858a7b4 --- /dev/null +++ b/libs/tflite/src/ext/nnapi_delegate.cpp @@ -0,0 +1,1209 @@ +/* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// NOTE To minimize diff with upstream tensorflow, disable clang-format +// clang-format off + +// NOTE This code is derived from the following file (in TensorFlow v1.12) +// 'externals/tensorflow/tensorflow/contrib/lite/nnapi_delegate.cc' +#include "tflite/ext/nnapi_delegate.h" +#include <fcntl.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/types.h> +#ifdef OBS_BUILD +#include "tensorflow/contrib/lite/builtin_op_data.h" +#include "tensorflow/contrib/lite/error_reporter.h" +#else +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/core/api/error_reporter.h" +#endif +#include "tensorflow/contrib/lite/model.h" +#include "NeuralNetworksShim.h" +#include "NeuralNetworksExShim.h" + +#ifdef __ANDROID__ +#include <android/log.h> +#include <sys/system_properties.h> +#endif + +namespace nnfw { +namespace tflite { + +void logError(const char* format, ...) { + // stderr is convenient for native tests, but is not captured for apps + va_list args_for_stderr; + va_start(args_for_stderr, format); + vfprintf(stderr, format, args_for_stderr); + va_end(args_for_stderr); + fprintf(stderr, "\n"); + fflush(stderr); +#ifdef __ANDROID__ + // produce logcat output for general consumption + va_list args_for_log; + va_start(args_for_log, format); + __android_log_vprint(ANDROID_LOG_ERROR, "tflite", format, args_for_log); + va_end(args_for_log); +#endif +} + +#define FATAL(...) \ + logError(__VA_ARGS__); \ + exit(1); + +// TODO(aselle): Change the error model to use status codes. 
+#define CHECK_TFLITE_SUCCESS(x) \ + if (x != kTfLiteOk) { \ + FATAL("Aborting since tflite returned failure nnapi_delegate.cc:%d.", \ + __LINE__); \ + } + +#define CHECK_NN(x) \ + if (x != ANEURALNETWORKS_NO_ERROR) { \ + FATAL("Aborting since NNAPI returned failure nnapi_delegate.cc:%d", \ + __LINE__); \ + } + +#define RETURN_ERROR_IF_TFLITE_FAILED(x) \ + if (x != kTfLiteOk) { \ + logError( \ + "Returning error since TFLite returned failure nnapi_delegate.cc:%d.", \ + __LINE__); \ + return kTfLiteError; \ + } + +#define RETURN_ERROR_IF_NN_FAILED(x) \ + if (x != ANEURALNETWORKS_NO_ERROR) { \ + logError( \ + "Returning error since NNAPI returned failure nnapi_delegate.cc:%d.", \ + __LINE__); \ + return kTfLiteError; \ + } + +// Tracking of NNAPI operand ids +static const int64_t kOperandIdNotSet = -1; +static const int64_t kOperandNotNeeded = -2; + +namespace { + +int32_t GetAndroidSdkVersion() { +#ifdef __ANDROID__ + const char* sdkProp = "ro.build.version.sdk"; + char sdkVersion[PROP_VALUE_MAX]; + int length = __system_property_get(sdkProp, sdkVersion); + if (length != 0) { + for (int i = 0; i < length; ++i) { + int digit = sdkVersion[i] - '0'; + if (digit < 0 || digit > 9) { + // Non-numeric SDK version, assume it's higher than expected; + return 0xFFFF; + } + } + return atoi(sdkVersion); + } + FATAL("No %s prop", sdkProp); +#endif // __ANDROID__ + return 0; +} + +int32_t GetAndroidSdkVersionCached() { + static int32_t androidSdkVersion = GetAndroidSdkVersion(); + return androidSdkVersion; +} + +static const uint32_t dimension_for_scalar[1] = {1}; + +} // namespace + +NNAPIAllocation::NNAPIAllocation(const char* filename, + ::tflite::ErrorReporter* error_reporter) + : MMAPAllocation(filename, error_reporter) { + if (mmapped_buffer_ != MAP_FAILED) + CHECK_NN(ANeuralNetworksMemory_createFromFd(buffer_size_bytes_, PROT_READ, + mmap_fd_, 0, &handle_)); +} + +NNAPIAllocation::~NNAPIAllocation() { + if (handle_) { + ANeuralNetworksMemory_free(handle_); + } +} + +NNAPIDelegate::~NNAPIDelegate() { + if (nn_compiled_model_) { + ANeuralNetworksCompilation_free(nn_compiled_model_); + nn_compiled_model_ = nullptr; + } + if (nn_model_) { + ANeuralNetworksModel_free(nn_model_); + nn_model_ = nullptr; + // TODO(aselle): Is this thread-safe and callable multiple times? + } + // ANeuralNetworksShutdown(); +} + +// Adds the tensors of the interpreter to the NN API model. +TfLiteStatus addTensorOperands(::tflite::Interpreter* interpreter, + ANeuralNetworksModel* nn_model, + uint32_t* no_of_operands_added, + std::vector<int64_t>* nnapi_ids) { + uint32_t next_id = 0; + for (size_t i = 0; i < interpreter->tensors_size(); i++) { + // Skip temporaries and RNN back-edges. + if ((*nnapi_ids)[i] == kOperandNotNeeded) continue; + + (*nnapi_ids)[i] = int64_t(next_id); + + int32_t nn_type = 0; + // NNAPI requires 32-bit float scale to be zero, tflite doesn't care + float scale = 0.0f; + int32_t zeroPoint = 0; + TfLiteTensor* tensor = interpreter->tensor(i); + switch (tensor->type) { + case kTfLiteNoType: + // Tensors added during initialization of Ops don't have a type yet and + // should not be registered with the NNAPI. + continue; + case kTfLiteFloat32: + nn_type = ANEURALNETWORKS_TENSOR_FLOAT32; + break; + case kTfLiteUInt8: + nn_type = ANEURALNETWORKS_TENSOR_QUANT8_ASYMM; + scale = tensor->params.scale; + // FIXME The next line is a workaround because currently zero scale is + // passed down from TF + // Lite.
Note that the latest NeuralNetworks.h (see + // https://android.googlesource.com/platform/frameworks/ml/+/master/nn/runtime/include/NeuralNetworks.h) + // requires scale to be greater than zero. Remove this workaround + // when the scale + // value is correctly passed. + scale = (scale == 0.0f) ? 1.0f : scale; + zeroPoint = tensor->params.zero_point; + break; + case kTfLiteInt32: + nn_type = ANEURALNETWORKS_TENSOR_INT32; + scale = tensor->params.scale; + zeroPoint = tensor->params.zero_point; + break; + case kTfLiteBool: + // Workaround to pass bool type under NNAPI + // Represent bool as ANEURALNETWORKS_TENSOR_QUANT8_ASYMM with scale = 1.0f and zero_point = 0 + nn_type = ANEURALNETWORKS_TENSOR_QUANT8_ASYMM; + scale = 1.0f; + zeroPoint = 0; + break; + default: + logError("Unsupported tensor type %d", tensor->type); + return kTfLiteError; + } + if (tensor->dims->size == 0) { + // WORKAROUND Some models have dimension zero + switch (tensor->type) { + case kTfLiteFloat32: + nn_type = ANEURALNETWORKS_TENSOR_FLOAT32; + break; + case kTfLiteInt32: + nn_type = ANEURALNETWORKS_TENSOR_INT32; + break; + default: + logError("NNAPI doesn't support tensors with rank 0 (index %d name %s)", + i, tensor->name); + return kTfLiteError; + } + } + if (tensor->dims->size > 4) { + logError("NNAPI doesn't support tensors with rank > 4 (index %d name %s)", + i, tensor->name); + return kTfLiteError; + } + // TODO(aselle): Note, many of these are intermediate results. Do I need + // to ever specify these sizes. I am currently below doing setValue + // on all of them, but I shouldn't in the future. + // Answer(jeanluc): If all the operators can set the dimension correctly, + // you won't need to. + ANeuralNetworksOperandType operand_type{ + nn_type, static_cast<uint32_t>(tensor->dims->size), + reinterpret_cast<uint32_t*>(tensor->dims->data), scale, zeroPoint}; + if (tensor->dims->size == 0) { + // WORKAROUND Some models have dimension zero + // Consider scalar as vector size 1 + operand_type.dimensions = dimension_for_scalar; + operand_type.dimensionCount = 1; + } + RETURN_ERROR_IF_NN_FAILED( + ANeuralNetworksModel_addOperand(nn_model, &operand_type)); + // TODO(aselle): Based on Michael's suggestion, limiting this to read + // only memory + if (tensor->allocation_type == kTfLiteMmapRo) { + if (const NNAPIAllocation* alloc = dynamic_cast<const NNAPIAllocation*>( + static_cast<const ::tflite::Allocation*>(tensor->allocation))) { + RETURN_ERROR_IF_NN_FAILED( + ANeuralNetworksModel_setOperandValueFromMemory( + nn_model, next_id, alloc->memory(), + alloc->offset(tensor->data.raw), tensor->bytes)); + } else { + RETURN_ERROR_IF_NN_FAILED(ANeuralNetworksModel_setOperandValue( + nn_model, next_id, tensor->data.raw, tensor->bytes)); + } + } else if (tensor->bytes == 0) { + // These size 0 tensors are optional tensors reserved. + RETURN_ERROR_IF_NN_FAILED( + ANeuralNetworksModel_setOperandValue(nn_model, next_id, nullptr, 0)); + } + + ++next_id; + } + *no_of_operands_added = next_id; + return kTfLiteOk; +} + +void MapAndAddTensorIds(const int* from_ids_buf, size_t from_ids_count, + std::vector<uint32_t>* into, + const std::vector<int64_t>& map) { + for (size_t i = 0; i < from_ids_count; i++) { + int from_id = from_ids_buf[i]; + if (from_id == kOptionalTensor) { + into->push_back(from_id); + } else { + into->push_back(map[from_id]); + } + } +} + +// Adds the operations and their parameters to the NN API model. +// 'next-id' is the operand ID of the next operand of the model.
+TfLiteStatus AddOpsAndParams( + ::tflite::Interpreter* interpreter, ANeuralNetworksModel* nn_model, + uint32_t next_id, std::vector<int>* model_state_inputs, + std::vector<int>* model_state_outputs, + const std::vector<int64_t>& tensor_id_to_nnapi_id) { + for (size_t i = 0; i < interpreter->nodes_size(); i++) { + const auto* node_and_registration = interpreter->node_and_registration(i); + const TfLiteNode& node = node_and_registration->first; + const TfLiteRegistration& registration = node_and_registration->second; + ::tflite::BuiltinOperator builtin = + static_cast<::tflite::BuiltinOperator>(registration.builtin_code); + + // Add the parameters. + std::vector<uint32_t> augmented_inputs, augmented_outputs; + MapAndAddTensorIds(node.inputs->data, node.inputs->size, &augmented_inputs, + tensor_id_to_nnapi_id); + MapAndAddTensorIds(node.outputs->data, node.outputs->size, + &augmented_outputs, tensor_id_to_nnapi_id); + + auto add_scalar_int32 = [&nn_model, &augmented_inputs, + &next_id](int value) { + ANeuralNetworksOperandType operand_type{.type = ANEURALNETWORKS_INT32}; + CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type)) + CHECK_NN(ANeuralNetworksModel_setOperandValue(nn_model, next_id, &value, + sizeof(int32_t))) + augmented_inputs.push_back(next_id++); + }; + + auto add_scalar_float32 = [&nn_model, &augmented_inputs, + &next_id](float value) { + ANeuralNetworksOperandType operand_type{.type = ANEURALNETWORKS_FLOAT32}; + CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type)) + CHECK_NN(ANeuralNetworksModel_setOperandValue(nn_model, next_id, &value, + sizeof(float))) + augmented_inputs.push_back(next_id++); + }; + + auto add_vector_int32 = [&](const int* values, uint32_t num_values) { + ANeuralNetworksOperandType operand_type{ + .type = ANEURALNETWORKS_TENSOR_INT32, + .dimensionCount = 1, + .dimensions = &num_values}; + CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type)) + CHECK_NN(ANeuralNetworksModel_setOperandValue( + nn_model, next_id, values, sizeof(int32_t) * num_values)); + augmented_inputs.push_back(next_id++); + }; + + // Handle state tensors of RNN, LSTM, SVDF. + // For each state_out tensor, a corresponding state_in operand needs to be + // created for NNAPI. 
+ auto duplicate_state_tensor_float32 = + [interpreter, &nn_model, &next_id, &augmented_inputs, + &model_state_inputs, &model_state_outputs](int tensor_id) { + const TfLiteTensor* tensor = interpreter->tensor(tensor_id); + ANeuralNetworksOperandType operand_type{ + ANEURALNETWORKS_TENSOR_FLOAT32, + static_cast<uint32_t>(tensor->dims->size), + reinterpret_cast<uint32_t*>(tensor->dims->data), + tensor->params.scale, tensor->params.zero_point}; + CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type)); + augmented_inputs.push_back(next_id); + model_state_inputs->push_back(next_id); + model_state_outputs->push_back(tensor_id); + next_id++; + }; + auto check_and_add_activation = [&add_scalar_int32](int activation) { + if (activation > kTfLiteActRelu6) { + logError("NNAPI only supports RELU, RELU1 and RELU6 activations"); + return kTfLiteError; + } + add_scalar_int32(activation); + return kTfLiteOk; + }; + + auto add_add_params = [&add_scalar_int32](void* data) { + auto* builtin = reinterpret_cast<TfLiteAddParams*>(data); + if (builtin->activation > kTfLiteActRelu6) { + logError("NNAPI only supports RELU, RELU1 and RELU6 activations"); + return kTfLiteError; + } + add_scalar_int32(builtin->activation); + return kTfLiteOk; + }; + + auto add_pooling_params = [&add_scalar_int32, + &check_and_add_activation](void* data) { + auto builtin = reinterpret_cast<TfLitePoolParams*>(data); + add_scalar_int32(builtin->padding); + add_scalar_int32(builtin->stride_width); + add_scalar_int32(builtin->stride_height); + add_scalar_int32(builtin->filter_width); + add_scalar_int32(builtin->filter_height); + return check_and_add_activation(builtin->activation); + }; + + auto add_convolution_params = [&add_scalar_int32, + &check_and_add_activation](void* data) { + auto builtin = reinterpret_cast<TfLiteConvParams*>(data); + add_scalar_int32(builtin->padding); + add_scalar_int32(builtin->stride_width); + add_scalar_int32(builtin->stride_height); + return check_and_add_activation(builtin->activation); + }; + + auto add_depthwise_conv_params = [&add_scalar_int32, + &check_and_add_activation](void* data) { + auto builtin = reinterpret_cast<TfLiteDepthwiseConvParams*>(data); + add_scalar_int32(builtin->padding); + add_scalar_int32(builtin->stride_width); + add_scalar_int32(builtin->stride_height); + add_scalar_int32(builtin->depth_multiplier); + return check_and_add_activation(builtin->activation); + }; + + auto add_fully_connected_params = [&check_and_add_activation](void* data) { + auto builtin = reinterpret_cast<TfLiteFullyConnectedParams*>(data); + return check_and_add_activation(builtin->activation); + }; + + auto add_concatenation_params = [&add_scalar_int32](void* data) { + auto builtin = reinterpret_cast<TfLiteConcatenationParams*>(data); + add_scalar_int32(builtin->axis); + if (builtin->activation != kTfLiteActNone) { + logError("Concatenation does not support fused activation in NNAPI"); + return kTfLiteError; + } + return kTfLiteOk; + }; + + auto add_softmax_params = [&add_scalar_float32](void* data) { + auto builtin = reinterpret_cast<TfLiteSoftmaxParams*>(data); + add_scalar_float32(builtin->beta); + }; + + auto add_space_to_depth_params = [&add_scalar_int32](void* data) { + auto builtin = reinterpret_cast<TfLiteSpaceToDepthParams*>(data); + add_scalar_int32(builtin->block_size); + }; + + auto add_lstm_params = [&add_scalar_int32, + &add_scalar_float32](void* data) { + auto builtin = reinterpret_cast<TfLiteLSTMParams*>(data); + add_scalar_int32(builtin->activation); + 
add_scalar_float32(builtin->cell_clip); + add_scalar_float32(builtin->proj_clip); + }; + + // LSTM in NNAPI requires scratch tensor as an output operand. + auto add_lstm_scratch_tensor_float32 = [interpreter, &node, &nn_model, + &next_id, &augmented_outputs]() { + if (node.temporaries->size == 0) return; + int scratch_buffer_index = node.temporaries->data[0]; + const TfLiteTensor* tensor = interpreter->tensor(scratch_buffer_index); + ANeuralNetworksOperandType operand_type{ + ANEURALNETWORKS_TENSOR_FLOAT32, + static_cast<uint32_t>(tensor->dims->size), + reinterpret_cast<uint32_t*>(tensor->dims->data), tensor->params.scale, + tensor->params.zero_point}; + CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type)); + augmented_outputs.insert(augmented_outputs.begin(), next_id++); + }; + + auto add_mean_params = [&add_scalar_int32](void* data) { +#ifdef OBS_BUILD + auto builtin = reinterpret_cast<TfLiteMeanParams*>(data); +#else + auto builtin = reinterpret_cast<TfLiteReducerParams*>(data); +#endif + add_scalar_int32(builtin->keep_dims); + }; + + auto add_svdf_params = [&add_scalar_int32](void* data) { + auto builtin = reinterpret_cast<TfLiteSVDFParams*>(data); + add_scalar_int32(builtin->rank); + add_scalar_int32(builtin->activation); + }; + + auto add_rnn_params = [&add_scalar_int32](void* data) { + auto builtin = reinterpret_cast<TfLiteRNNParams*>(data); + add_scalar_int32(builtin->activation); + }; + + auto add_squeeze_params = [&](void* data) { + const auto* builtin = reinterpret_cast<TfLiteSqueezeParams*>(data); + // Note that we add the squeeze dimensions even if the dimensions were + // unspecified (empty), as NNAPI requires the operand. + add_vector_int32(builtin->squeeze_dims, + static_cast<uint32_t>(builtin->num_squeeze_dims)); + }; + + // Handle optional input tensors. 
+ auto add_optional_tensors = [&nn_model, &augmented_inputs, + &next_id](int nn_type) { + for (size_t idx = 0; idx < augmented_inputs.size(); idx++) { + if (augmented_inputs[idx] == kOptionalTensor) { + const std::vector<uint32_t> dim = {0, 0}; + ANeuralNetworksOperandType operand_type{nn_type, 2, dim.data(), 0, 0}; + CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type)) + CHECK_NN(ANeuralNetworksModel_setOperandValue(nn_model, next_id, + nullptr, 0)) + augmented_inputs[idx] = next_id++; + } + } + }; + + int nnapi_version = 10; +#include "nnapi_delegate_ex_AddOpsAndParams_lambda.inc" + + ANeuralNetworksOperationType nn_op_type; + + // Using namespace directive to minimize diff with upstream tensorflow + namespace tflite = ::tflite; + + switch (builtin) { + case tflite::BuiltinOperator_ADD: + nn_op_type = ANEURALNETWORKS_ADD; + RETURN_ERROR_IF_TFLITE_FAILED(add_add_params(node.builtin_data)); + break; + case tflite::BuiltinOperator_MUL: + nn_op_type = ANEURALNETWORKS_MUL; + RETURN_ERROR_IF_TFLITE_FAILED(add_add_params(node.builtin_data)); + break; + case tflite::BuiltinOperator_AVERAGE_POOL_2D: + RETURN_ERROR_IF_TFLITE_FAILED(add_pooling_params(node.builtin_data)); + nn_op_type = ANEURALNETWORKS_AVERAGE_POOL_2D; + break; + case tflite::BuiltinOperator_MAX_POOL_2D: + RETURN_ERROR_IF_TFLITE_FAILED(add_pooling_params(node.builtin_data)); + nn_op_type = ANEURALNETWORKS_MAX_POOL_2D; + break; + case tflite::BuiltinOperator_L2_POOL_2D: + RETURN_ERROR_IF_TFLITE_FAILED(add_pooling_params(node.builtin_data)); + nn_op_type = ANEURALNETWORKS_L2_POOL_2D; + break; + case tflite::BuiltinOperator_CONV_2D: { + auto builtin = reinterpret_cast<TfLiteConvParams*>(node.builtin_data); + if (builtin->dilation_width_factor != 1 || + builtin->dilation_height_factor != 1 || node.inputs->size != 3) { + logError("NNAPI does not support dilated Conv2D."); + return kTfLiteError; + } + } + RETURN_ERROR_IF_TFLITE_FAILED( + add_convolution_params(node.builtin_data)); + nn_op_type = ANEURALNETWORKS_CONV_2D; + break; + case tflite::BuiltinOperator_RELU: + nn_op_type = ANEURALNETWORKS_RELU; + break; + case tflite::BuiltinOperator_RELU_N1_TO_1: + nn_op_type = ANEURALNETWORKS_RELU1; + break; + case tflite::BuiltinOperator_RELU6: + nn_op_type = ANEURALNETWORKS_RELU6; + break; + case tflite::BuiltinOperator_TANH: + nn_op_type = ANEURALNETWORKS_TANH; + break; + case tflite::BuiltinOperator_FLOOR: + nn_op_type = ANEURALNETWORKS_FLOOR; + break; + case tflite::BuiltinOperator_LOGISTIC: + nn_op_type = ANEURALNETWORKS_LOGISTIC; + break; + case tflite::BuiltinOperator_DEPTHWISE_CONV_2D: + RETURN_ERROR_IF_TFLITE_FAILED( + add_depthwise_conv_params(node.builtin_data)); + nn_op_type = ANEURALNETWORKS_DEPTHWISE_CONV_2D; + break; + case tflite::BuiltinOperator_CONCATENATION: + RETURN_ERROR_IF_TFLITE_FAILED( + add_concatenation_params(node.builtin_data)); + nn_op_type = ANEURALNETWORKS_CONCATENATION; + break; + case tflite::BuiltinOperator_SOFTMAX: + add_softmax_params(node.builtin_data); + nn_op_type = ANEURALNETWORKS_SOFTMAX; + break; + case tflite::BuiltinOperator_FULLY_CONNECTED: + RETURN_ERROR_IF_TFLITE_FAILED( + add_fully_connected_params(node.builtin_data)); + nn_op_type = ANEURALNETWORKS_FULLY_CONNECTED; + break; + case tflite::BuiltinOperator_RESHAPE: + if (node.inputs->size != 2) { + logError("NNAPI only supports 2-input RESHAPE"); + return kTfLiteError; + } + nn_op_type = ANEURALNETWORKS_RESHAPE; + // add_reshape_params(node.builtin_data); + break; + case tflite::BuiltinOperator_RESIZE_BILINEAR: + 
add_resize_bilinear_params(node.builtin_data); + nn_op_type = ANEURALNETWORKS_RESIZE_BILINEAR; + break; + case tflite::BuiltinOperator_SPACE_TO_DEPTH: + add_space_to_depth_params(node.builtin_data); + nn_op_type = ANEURALNETWORKS_SPACE_TO_DEPTH; + break; + case tflite::BuiltinOperator_LSTM: { + if (node.inputs->size + /* no of params */ 3 != 21) { + logError("NNAPI only supports 21-input LSTMs"); + return kTfLiteError; + } + duplicate_state_tensor_float32( + node.outputs->data[/*kOutputStateTensor*/ 0]); + duplicate_state_tensor_float32( + node.outputs->data[/*kCellStateTensor*/ 1]); + add_lstm_params(node.builtin_data); + add_lstm_scratch_tensor_float32(); + add_optional_tensors(ANEURALNETWORKS_TENSOR_FLOAT32); + nn_op_type = ANEURALNETWORKS_LSTM; + break; + } + case tflite::BuiltinOperator_DEQUANTIZE: + nn_op_type = ANEURALNETWORKS_DEQUANTIZE; + break; + case tflite::BuiltinOperator_SVDF: { + duplicate_state_tensor_float32(node.outputs->data[/*kStateTensor*/ 0]); + add_svdf_params(node.builtin_data); + nn_op_type = ANEURALNETWORKS_SVDF; + break; + } + case tflite::BuiltinOperator_RNN: { + duplicate_state_tensor_float32( + node.outputs->data[/*kHiddenStateTensor*/ 0]); + add_rnn_params(node.builtin_data); + nn_op_type = ANEURALNETWORKS_RNN; + break; + } + case tflite::BuiltinOperator_EMBEDDING_LOOKUP: + nn_op_type = ANEURALNETWORKS_EMBEDDING_LOOKUP; + break; + case tflite::BuiltinOperator_PAD: + nnapi_version = 11; // require NNAPI 1.1 + nn_op_type = ANEURALNETWORKS_PAD; + break; + case tflite::BuiltinOperator_MEAN: + nnapi_version = 11; // require NNAPI 1.1 + add_mean_params(node.builtin_data); + nn_op_type = ANEURALNETWORKS_MEAN; + break; + case tflite::BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION: + nn_op_type = ANEURALNETWORKS_LOCAL_RESPONSE_NORMALIZATION; + add_lrn_params(node.builtin_data); + break; + case tflite::BuiltinOperator_DIV: + nnapi_version = 11; // require NNAPI 1.1 + nn_op_type = ANEURALNETWORKS_DIV; + RETURN_ERROR_IF_TFLITE_FAILED(check_and_add_activation( + reinterpret_cast<TfLiteDivParams*>(node.builtin_data)->activation)); + break; + case tflite::BuiltinOperator_SUB: + nnapi_version = 11; // require NNAPI 1.1 + nn_op_type = ANEURALNETWORKS_SUB; + RETURN_ERROR_IF_TFLITE_FAILED(check_and_add_activation( + reinterpret_cast<TfLiteSubParams*>(node.builtin_data)->activation)); + break; + case tflite::BuiltinOperator_SQUEEZE: + nnapi_version = 11; // requires NNAPI 1.1 + add_squeeze_params(node.builtin_data); + nn_op_type = ANEURALNETWORKS_SQUEEZE; + break; + case tflite::BuiltinOperator_TRANSPOSE: + // The permutation input tensor value dictates the output dimensions. + // TODO(b/110888333): Support dynamically-sized tensors in delegates. 
+ if ((node.inputs->size > 1) && + (interpreter->tensor(node.inputs->data[1])->allocation_type != + kTfLiteMmapRo)) { + logError("NNAPI does not yet support dynamic tensors."); + return kTfLiteError; + } + nnapi_version = 11; // require NNAPI 1.1 + nn_op_type = ANEURALNETWORKS_TRANSPOSE; + break; + case tflite::BuiltinOperator_L2_NORMALIZATION: + nn_op_type = ANEURALNETWORKS_L2_NORMALIZATION; + if (reinterpret_cast<TfLiteL2NormParams*>(node.builtin_data) + ->activation != kTfLiteActNone) { + logError( + "NNAPI does not support L2Normalization with fused activations"); + return kTfLiteError; + } + if ((node.inputs->size > 0) && + (interpreter->tensor(node.inputs->data[0])->dims->size != 4)) { + logError("NNAPI only supports input rank 4 for L2Normalization"); + return kTfLiteError; + } + break; + case tflite::BuiltinOperator_HASHTABLE_LOOKUP: + if (interpreter->tensor(node.outputs->data[0])->type != + kTfLiteFloat32) { + logError("NNAPI only support HASHTABLE_LOOKUP with float32 output", + builtin); + return kTfLiteError; + } + nn_op_type = ANEURALNETWORKS_HASHTABLE_LOOKUP; + break; + case tflite::BuiltinOperator_STRIDED_SLICE: + add_strided_slice_params(node.builtin_data); + nn_op_type = ANEURALNETWORKS_STRIDED_SLICE; + break; + case tflite::BuiltinOperator_SPACE_TO_BATCH_ND: + nnapi_version = 11; // require NNAPI 1.1 + nn_op_type = ANEURALNETWORKS_SPACE_TO_BATCH_ND; + break; + case tflite::BuiltinOperator_BATCH_TO_SPACE_ND: + nnapi_version = 11; // require NNAPI 1.1 + nn_op_type = ANEURALNETWORKS_BATCH_TO_SPACE_ND; + check_batch_to_space_params(); + break; + case tflite::BuiltinOperator_CAST: + CHECK_NN(ANeuralNetworksModel_addOperationEx( + nn_model, ANEURALNETWORKS_CAST_EX, + static_cast<uint32_t>(augmented_inputs.size()), + augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size), + reinterpret_cast<uint32_t*>(node.outputs->data))); + continue; + case tflite::BuiltinOperator_TOPK_V2: + CHECK_NN(ANeuralNetworksModel_addOperationEx( + nn_model, ANEURALNETWORKS_TOPK_V2_EX, + static_cast<uint32_t>(augmented_inputs.size()), + augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size), + reinterpret_cast<uint32_t*>(node.outputs->data))); + continue; + case tflite::BuiltinOperator_GATHER: + add_gather_ex_params(node.builtin_data); + CHECK_NN(ANeuralNetworksModel_addOperationEx( + nn_model, ANEURALNETWORKS_GATHER_EX, + static_cast<uint32_t>(augmented_inputs.size()), + augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size), + reinterpret_cast<uint32_t*>(node.outputs->data))); + continue; + case tflite::BuiltinOperator_SPLIT: + CHECK_NN(ANeuralNetworksModel_addOperationEx( + nn_model, ANEURALNETWORKS_SPLIT_EX, + static_cast<uint32_t>(augmented_inputs.size()), + augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size), + reinterpret_cast<uint32_t*>(node.outputs->data))); + continue; + case tflite::BuiltinOperator_NEG: + CHECK_NN(ANeuralNetworksModel_addOperationEx( + nn_model, ANEURALNETWORKS_NEG_EX, + static_cast<uint32_t>(augmented_inputs.size()), + augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size), + reinterpret_cast<uint32_t*>(node.outputs->data))); + continue; + case tflite::BuiltinOperator_EXP: + CHECK_NN(ANeuralNetworksModel_addOperationEx( + nn_model, ANEURALNETWORKS_EXP_EX, + static_cast<uint32_t>(augmented_inputs.size()), + augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size), + reinterpret_cast<uint32_t*>(node.outputs->data))); + continue; + case tflite::BuiltinOperator_TRANSPOSE_CONV: + 
add_transpose_conv_params(node.builtin_data); + CHECK_NN(ANeuralNetworksModel_addOperationEx( + nn_model, ANEURALNETWORKS_TRANSPOSE_CONV_EX, + static_cast<uint32_t>(augmented_inputs.size()), + augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size), + reinterpret_cast<uint32_t*>(node.outputs->data))); + continue; + case tflite::BuiltinOperator_PRELU: + CHECK_NN(ANeuralNetworksModel_addOperationEx( + nn_model, ANEURALNETWORKS_PRELU_EX, + static_cast<uint32_t>(augmented_inputs.size()), + augmented_inputs.data(), + static_cast<uint32_t>(node.outputs->size), + reinterpret_cast<uint32_t*>(node.outputs->data))); + continue; + case tflite::BuiltinOperator_ARG_MAX: + CHECK_NN(ANeuralNetworksModel_addOperationEx( + nn_model, ANEURALNETWORKS_ARGMAX_EX, + static_cast<uint32_t>(augmented_inputs.size()), + augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size), + reinterpret_cast<uint32_t*>(node.outputs->data))); + continue; +#ifndef OBS_BUILD + case tflite::BuiltinOperator_PACK: + add_pack_ex_params(node.builtin_data); + CHECK_NN(ANeuralNetworksModel_addOperationEx( + nn_model, ANEURALNETWORKS_PACK_EX, + static_cast<uint32_t>(augmented_inputs.size()), + augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size), + reinterpret_cast<uint32_t*>(node.outputs->data))); + continue; + case tflite::BuiltinOperator_UNPACK: + add_unpack_ex_params(node.builtin_data); + CHECK_NN(ANeuralNetworksModel_addOperationEx( + nn_model, ANEURALNETWORKS_UNPACK_EX, + static_cast<uint32_t>(augmented_inputs.size()), + augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size), + reinterpret_cast<uint32_t*>(node.outputs->data))); + continue; + case tflite::BuiltinOperator_SQRT: + CHECK_NN(ANeuralNetworksModel_addOperationEx( + nn_model, ANEURALNETWORKS_SQRT_EX, + static_cast<uint32_t>(augmented_inputs.size()), + augmented_inputs.data(), + static_cast<uint32_t>(node.outputs->size), + reinterpret_cast<uint32_t*>(node.outputs->data))); + continue; + case tflite::BuiltinOperator_RSQRT: + CHECK_NN(ANeuralNetworksModel_addOperationEx( + nn_model, ANEURALNETWORKS_RSQRT_EX, + static_cast<uint32_t>(augmented_inputs.size()), + augmented_inputs.data(), + static_cast<uint32_t>(node.outputs->size), + reinterpret_cast<uint32_t*>(node.outputs->data))); + continue; + case tflite::BuiltinOperator_EQUAL: + CHECK_NN(ANeuralNetworksModel_addOperationEx( + nn_model, ANEURALNETWORKS_EQUAL_EX, + static_cast<uint32_t>(augmented_inputs.size()), + augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size), + reinterpret_cast<uint32_t*>(node.outputs->data))); + continue; + case tflite::BuiltinOperator_NOT_EQUAL: + CHECK_NN(ANeuralNetworksModel_addOperationEx( + nn_model, ANEURALNETWORKS_NOT_EQUAL_EX, + static_cast<uint32_t>(augmented_inputs.size()), + augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size), + reinterpret_cast<uint32_t*>(node.outputs->data))); + continue; + case tflite::BuiltinOperator_SUM: + CHECK_NN(ANeuralNetworksModel_addOperationEx( + nn_model, ANEURALNETWORKS_REDUCE_SUM_EX, + static_cast<uint32_t>(augmented_inputs.size()), + augmented_inputs.data(), + static_cast<uint32_t>(node.outputs->size), + reinterpret_cast<uint32_t*>(node.outputs->data))); + continue; + case tflite::BuiltinOperator_REDUCE_MAX: + CHECK_NN(ANeuralNetworksModel_addOperationEx( + nn_model, ANEURALNETWORKS_TENSORFLOW_MAX_EX, + static_cast<uint32_t>(augmented_inputs.size()), + augmented_inputs.data(), + static_cast<uint32_t>(node.outputs->size), + reinterpret_cast<uint32_t*>(node.outputs->data))); + continue; 
+ case tflite::BuiltinOperator_REDUCE_MIN: + CHECK_NN(ANeuralNetworksModel_addOperationEx( + nn_model, ANEURALNETWORKS_REDUCE_MIN_EX, + static_cast<uint32_t>(augmented_inputs.size()), + augmented_inputs.data(), + static_cast<uint32_t>(node.outputs->size), + reinterpret_cast<uint32_t*>(node.outputs->data))); + continue; + case tflite::BuiltinOperator_LOGICAL_AND: + CHECK_NN(ANeuralNetworksModel_addOperationEx( + nn_model, ANEURALNETWORKS_LOGICAL_AND_EX, + static_cast<uint32_t>(augmented_inputs.size()), + augmented_inputs.data(), + static_cast<uint32_t>(node.outputs->size), + reinterpret_cast<uint32_t*>(node.outputs->data))); + continue; + case tflite::BuiltinOperator_LOGICAL_OR: + CHECK_NN(ANeuralNetworksModel_addOperationEx( + nn_model, ANEURALNETWORKS_LOGICAL_OR_EX, + static_cast<uint32_t>(augmented_inputs.size()), + augmented_inputs.data(), + static_cast<uint32_t>(node.outputs->size), + reinterpret_cast<uint32_t*>(node.outputs->data))); + continue; +#endif + case tflite::BuiltinOperator_CONCAT_EMBEDDINGS: + case tflite::BuiltinOperator_LSH_PROJECTION: + case tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN: + case tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN: + case tflite::BuiltinOperator_EMBEDDING_LOOKUP_SPARSE: + case tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM: + case tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM: + //case tflite::BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION: + case tflite::BuiltinOperator_PADV2: + //case tflite::BuiltinOperator_RESIZE_BILINEAR: + case tflite::BuiltinOperator_CALL: + case tflite::BuiltinOperator_SKIP_GRAM: + //case tflite::BuiltinOperator_RELU_N1_TO_1: + //case tflite::BuiltinOperator_GATHER: + //case tflite::BuiltinOperator_SPACE_TO_BATCH_ND: + //case tflite::BuiltinOperator_BATCH_TO_SPACE_ND: + //case tflite::BuiltinOperator_TOPK_V2: + //case tflite::BuiltinOperator_SPLIT: + //case tflite::BuiltinOperator_STRIDED_SLICE: + //case tflite::BuiltinOperator_EXP: + case tflite::BuiltinOperator_LOG_SOFTMAX: + //case tflite::BuiltinOperator_DEQUANTIZE: + case tflite::BuiltinOperator_DELEGATE: + //case tflite::BuiltinOperator_CAST: + //case tflite::BuiltinOperator_PRELU: + case tflite::BuiltinOperator_MAXIMUM: + case tflite::BuiltinOperator_MINIMUM: +#ifndef OBS_BUILD + case tflite::BuiltinOperator_ARG_MIN: +#endif + case tflite::BuiltinOperator_GREATER: + case tflite::BuiltinOperator_GREATER_EQUAL: + case tflite::BuiltinOperator_LESS: + case tflite::BuiltinOperator_LESS_EQUAL: + //case tflite::BuiltinOperator_NEG: + case tflite::BuiltinOperator_SELECT: + case tflite::BuiltinOperator_SLICE: + case tflite::BuiltinOperator_SIN: + //case tflite::BuiltinOperator_LOG: + //case tflite::BuiltinOperator_TRANSPOSE_CONV: +#ifndef OBS_BUILD + case tflite::BuiltinOperator_TILE: + case tflite::BuiltinOperator_EXPAND_DIMS: + case tflite::BuiltinOperator_SPARSE_TO_DENSE: + //case tflite::BuiltinOperator_EQUAL: + //case tflite::BuiltinOperator_NOT_EQUAL: + //case tflite::BuiltinOperator_SUM: + //case tflite::BuiltinOperator_REDUCE_MAX: + //case tflite::BuiltinOperator_REDUCE_MIN: + case tflite::BuiltinOperator_REDUCE_PROD: + //case tflite::BuiltinOperator_SQRT: + //case tflite::BuiltinOperator_RSQRT: + case tflite::BuiltinOperator_SHAPE: + case tflite::BuiltinOperator_POW: + case tflite::BuiltinOperator_FAKE_QUANT: + //case tflite::BuiltinOperator_PACK: + //case tflite::BuiltinOperator_LOGICAL_OR: + case tflite::BuiltinOperator_ONE_HOT: + //case tflite::BuiltinOperator_LOGICAL_AND: + case tflite::BuiltinOperator_LOGICAL_NOT: + //case 
tflite::BuiltinOperator_UNPACK: + case tflite::BuiltinOperator_FLOOR_DIV: + case tflite::BuiltinOperator_REDUCE_ANY: + case tflite::BuiltinOperator_SQUARE: + case tflite::BuiltinOperator_ZEROS_LIKE: + case tflite::BuiltinOperator_FILL: +#endif + logError("Op code %d is currently not delegated to NNAPI", builtin); + return kTfLiteError; + break; + case tflite::BuiltinOperator_CUSTOM: { + std::string custom_name(registration.custom_name); + if (custom_name.compare("TensorFlowMax") == 0) { + CHECK_NN(ANeuralNetworksModel_addOperationEx( + nn_model, ANEURALNETWORKS_TENSORFLOW_MAX_EX, + static_cast<uint32_t>(augmented_inputs.size()), + augmented_inputs.data(), + static_cast<uint32_t>(node.outputs->size), + reinterpret_cast<uint32_t*>(node.outputs->data))); + continue; + } + else if (custom_name.compare("SquaredDifference") == 0) { + CHECK_NN(ANeuralNetworksModel_addOperationEx( + nn_model, ANEURALNETWORKS_SQUARED_DIFFERENCE_EX, + static_cast<uint32_t>(augmented_inputs.size()), + augmented_inputs.data(), + static_cast<uint32_t>(node.outputs->size), + reinterpret_cast<uint32_t*>(node.outputs->data))); + continue; + } + else if (custom_name.compare("TensorFlowSum") == 0) { + CHECK_NN(ANeuralNetworksModel_addOperationEx( + nn_model, ANEURALNETWORKS_REDUCE_SUM_EX, + static_cast<uint32_t>(augmented_inputs.size()), + augmented_inputs.data(), + static_cast<uint32_t>(node.outputs->size), + reinterpret_cast<uint32_t*>(node.outputs->data))); + continue; + } + logError("Custom operations are not supported when using NNAPI."); + return kTfLiteError; + break; + } +#ifdef OBS_BUILD + default: + logError("Op code %d is currently not delegated to NNAPI", builtin); + return kTfLiteError; + break; +#endif + } + + //if (nnapi_version == 11 && GetAndroidSdkVersionCached() < 28) { + // FATAL("Op %d needs NNAPI1.1", builtin); + //} + + // Add the operation. + RETURN_ERROR_IF_NN_FAILED(ANeuralNetworksModel_addOperation( + nn_model, nn_op_type, static_cast<uint32_t>(augmented_inputs.size()), + augmented_inputs.data(), + static_cast<uint32_t>(augmented_outputs.size()), + reinterpret_cast<uint32_t*>(augmented_outputs.data()))); + } + return kTfLiteOk; +} + +TfLiteStatus NNAPIDelegate::BuildGraph(::tflite::Interpreter* interpreter) { + if (nn_model_ && nn_compiled_model_) return model_status_; + + // TODO(aselle): This is not correct. need to handle resize invalidation. + if (!nn_model_) { + CHECK_NN(ANeuralNetworksModel_create(&nn_model_)); + + // Find which tensors should be added to NNAPI. TFLite has temporaries + // and RNN back-edges which are are not valid for NNAPI. We look through all + // inputs and outputs and mark the mapping in tensor_id_to_nnapi_id with + // kOperandIdNotSet. addTensorOperands will replace those with the + // corresponding NNAPI operand ids and skip kOperandNotNeeded entries. 
+ std::vector<int64_t> tensor_id_to_nnapi_id(interpreter->tensors_size(), + kOperandNotNeeded); + auto set_ids_to_not_set = [&tensor_id_to_nnapi_id](const int* buf, + size_t count) { + for (int j = 0; j < count; j++) { + auto tensor_id = buf[j]; + if (tensor_id != kOptionalTensor) { + tensor_id_to_nnapi_id[tensor_id] = kOperandIdNotSet; + } + } + }; + for (size_t i = 0; i < interpreter->nodes_size(); i++) { + const auto* node_and_registration = interpreter->node_and_registration(i); + const TfLiteNode& node = node_and_registration->first; + set_ids_to_not_set(node.inputs->data, node.inputs->size); + set_ids_to_not_set(node.outputs->data, node.outputs->size); + } + set_ids_to_not_set(interpreter->inputs().data(), + interpreter->inputs().size()); + set_ids_to_not_set(interpreter->outputs().data(), + interpreter->outputs().size()); + + uint32_t next_id = 0; + RETURN_ERROR_IF_TFLITE_FAILED(addTensorOperands( + interpreter, nn_model_, &next_id, &tensor_id_to_nnapi_id)); + RETURN_ERROR_IF_TFLITE_FAILED( + AddOpsAndParams(interpreter, nn_model_, next_id, &model_states_inputs_, + &model_states_outputs_, tensor_id_to_nnapi_id)); + + std::vector<uint32_t> augmented_inputs; + MapAndAddTensorIds(interpreter->inputs().data(), + interpreter->inputs().size(), &augmented_inputs, + tensor_id_to_nnapi_id); + augmented_inputs.insert(augmented_inputs.end(), + model_states_inputs_.begin(), + model_states_inputs_.end()); + std::vector<uint32_t> augmented_outputs; + MapAndAddTensorIds(interpreter->outputs().data(), + interpreter->outputs().size(), &augmented_outputs, + tensor_id_to_nnapi_id); + MapAndAddTensorIds(model_states_outputs_.data(), + model_states_outputs_.size(), &augmented_outputs, + tensor_id_to_nnapi_id); + + CHECK_NN(ANeuralNetworksModel_identifyInputsAndOutputs( + nn_model_, static_cast<uint32_t>(augmented_inputs.size()), + reinterpret_cast<const uint32_t*>(augmented_inputs.data()), + static_cast<uint32_t>(augmented_outputs.size()), + reinterpret_cast<const uint32_t*>(augmented_outputs.data()))); + + // TODO Support ANeuralNetworksModel_relaxComputationFloat32toFloat16 + //if (GetAndroidSdkVersionCached() >= 28) { + // CHECK_NN(ANeuralNetworksModel_relaxComputationFloat32toFloat16( + // nn_model_, interpreter->GetAllowFp16PrecisionForFp32())); + //} + CHECK_NN(ANeuralNetworksModel_finish(nn_model_)); + } + if (!nn_compiled_model_) { + CHECK_NN(ANeuralNetworksCompilation_create(nn_model_, &nn_compiled_model_)); + CHECK_NN(ANeuralNetworksCompilation_finish(nn_compiled_model_)); + } + return kTfLiteOk; +} + +#include <unordered_map> + +TfLiteStatus NNAPIDelegate::Invoke(::tflite::Interpreter* interpreter) { + if (!nn_model_) { + model_status_ = BuildGraph(interpreter); + if (model_status_ != kTfLiteOk) { + logError("Failed to build graph for NNAPI"); + } + } + if (model_status_ != kTfLiteOk) { + return model_status_; + } + + ANeuralNetworksExecution* execution = nullptr; + CHECK_NN(ANeuralNetworksExecution_create(nn_compiled_model_, &execution)); + + // Allocate temporary buffer to save casted boolean tensor + std::unordered_map<size_t, uint8_t*> input_boolean_tensors; + std::unordered_map<size_t, uint8_t*> output_boolean_tensors; + for (size_t i = 0; i < interpreter->inputs().size(); i++) + { + int input = interpreter->inputs()[i]; + TfLiteTensor* tensor = interpreter->tensor(input); + if (tensor->type == kTfLiteBool) + { + size_t elements = tensor->bytes / sizeof(bool); + uint8_t* temp_tensor = new uint8_t[tensor->bytes / sizeof(bool)]; + input_boolean_tensors[i] = temp_tensor; + for (size_t idx = 
0; idx < elements; idx++)
+        {
+          temp_tensor[idx] = (tensor->data.b[idx] ? 0x00 : 0xff);
+        }
+      }
+    }
+    for (size_t i = 0; i < interpreter->outputs().size(); i++)
+    {
+      int output = interpreter->outputs()[i];
+      TfLiteTensor* tensor = interpreter->tensor(output);
+      if (tensor->type == kTfLiteBool)
+      {
+        uint8_t* temp_tensor = new uint8_t[tensor->bytes / sizeof(bool)];
+        output_boolean_tensors[i] = temp_tensor;
+      }
+    }
+
+    // Currently perform deep copy of input buffer
+    for (size_t i = 0; i < interpreter->inputs().size(); i++) {
+      int input = interpreter->inputs()[i];
+      // TODO(aselle): Is this what we want or do we want input instead?
+      // TODO(aselle): This should be called setInputValue maybe to be cons.
+      TfLiteTensor* tensor = interpreter->tensor(input);
+      if (tensor->type == kTfLiteBool)
+      {
+        CHECK_NN(ANeuralNetworksExecution_setInput(
+            execution, i, nullptr, input_boolean_tensors[i], tensor->bytes * sizeof(uint8_t) / sizeof(bool)));
+      }
+      else
+      {
+        CHECK_NN(ANeuralNetworksExecution_setInput(
+            execution, i, nullptr, tensor->data.raw, tensor->bytes));
+      }
+    }
+
+    // Tell nn api where to place final data.
+    for (size_t i = 0; i < interpreter->outputs().size(); i++) {
+      int output = interpreter->outputs()[i];
+      TfLiteTensor* tensor = interpreter->tensor(output);
+
+      if (tensor->type == kTfLiteBool)
+      {
+        CHECK_NN(ANeuralNetworksExecution_setOutput(
+            execution, i, nullptr, output_boolean_tensors[i], tensor->bytes * sizeof(uint8_t) / sizeof(bool)));
+      }
+      else
+      {
+        CHECK_NN(ANeuralNetworksExecution_setOutput(
+            execution, i, nullptr, tensor->data.raw, tensor->bytes));
+      }
+    }
+
+    // The state_out of the previous invocation needs to be mapped to state_in
+    // of the current invocation.
+    for (size_t i = 0; i < model_states_outputs_.size(); i++) {
+      int state_tensor_idx = model_states_outputs_[i];
+      TfLiteTensor* tensor = interpreter->tensor(state_tensor_idx);
+      // Here we are using a deep copy for state_in tensors so that we are not
+      // reading and writing into the same buffer during an invocation.
+      // TODO(miaowang): using double shared buffer to minimize the copies.
+      CHECK_NN(ANeuralNetworksExecution_setInput(
+          execution, i + interpreter->inputs().size(), nullptr, tensor->data.raw,
+          tensor->bytes));
+      // Tell NNAPI where to output the state_out.
+      CHECK_NN(ANeuralNetworksExecution_setOutput(
+          execution, i + interpreter->outputs().size(), nullptr, tensor->data.raw,
+          tensor->bytes));
+    }
+
+    // Currently use blocking compute.
+    ANeuralNetworksEvent* event = nullptr;
+    CHECK_NN(ANeuralNetworksExecution_startCompute(execution, &event));
+    CHECK_NN(ANeuralNetworksEvent_wait(event));
+    ANeuralNetworksEvent_free(event);
+    ANeuralNetworksExecution_free(execution);
+
+    // Release the temporary buffers allocated for boolean input tensors.
+    for (size_t i = 0; i < interpreter->inputs().size(); i++) {
+      int input = interpreter->inputs()[i];
+      TfLiteTensor* tensor = interpreter->tensor(input);
+
+      if (tensor->type == kTfLiteBool)
+      {
+        uint8_t* temp_tensor = input_boolean_tensors[i];
+        input_boolean_tensors[i] = nullptr;
+        delete[] temp_tensor;
+      }
+    }
+    // Copy boolean outputs back from the temporary buffers and release them.
+    for (size_t i = 0; i < interpreter->outputs().size(); i++) {
+      int output = interpreter->outputs()[i];
+      TfLiteTensor* tensor = interpreter->tensor(output);
+
+      if (tensor->type == kTfLiteBool)
+      {
+        uint8_t* temp_tensor = output_boolean_tensors[i];
+        size_t elements = tensor->bytes / sizeof(bool);
+        for (size_t idx = 0; idx < elements; idx++)
+        {
+          tensor->data.b[idx] = ((temp_tensor[idx] == 0x00) ? false : true);
+        }
+        output_boolean_tensors[i] = nullptr;
+        delete[] temp_tensor;
+      }
+    }
+
+#if 0
+  printf("From the NN API:\n");
+  TfLiteTensor* tensor = interpreter->tensor(interpreter->outputs()[0]);
+  if (float* data =
+          interpreter->typed_tensor<float>(interpreter->outputs()[0])) {
+    size_t num = tensor->bytes / sizeof(float);
+    for (float* p = data; p < data + num; p++) {
+      printf(" %f", *p);
+    }
+    printf("\n");
+  }
+#endif
+
+  return kTfLiteOk;
+}
+
+bool NNAPIDelegate::IsSupported() { return nnfw::NNAPIExists(); }
+
+} // namespace tflite
+} // namespace nnfw
+
+// clang-format on
diff --git a/libs/tflite/src/ext/nnapi_delegate_ex_AddOpsAndParams_lambda.inc b/libs/tflite/src/ext/nnapi_delegate_ex_AddOpsAndParams_lambda.inc
new file mode 100644
index 000000000..a91e4de60
--- /dev/null
+++ b/libs/tflite/src/ext/nnapi_delegate_ex_AddOpsAndParams_lambda.inc
@@ -0,0 +1,106 @@
+// This file is included from AddOpsAndParams defined in nnapi_delegate.cc
+// and contains lambdas for the extended implementation on top of the original TensorFlow Lite.
+  auto add_resize_bilinear_params = [&add_scalar_int32, &interpreter, &augmented_inputs](void* data) {
+    auto builtin = reinterpret_cast<TfLiteResizeBilinearParams*>(data);
+    if (builtin->align_corners) {
+      FATAL("Resize bilinear does not support align corners in NNAPI");
+    }
+
+    TfLiteTensor* tensor = interpreter->tensor(augmented_inputs.back());
+    assert(tensor->type == kTfLiteInt32);
+    assert(tensor->bytes == sizeof(int)*2);
+    augmented_inputs.pop_back();
+
+    int height = ((int*)(tensor->data.raw))[1];
+    int width = ((int*)(tensor->data.raw))[0];
+    add_scalar_int32(height);
+    add_scalar_int32(width);
+  };
+
+  auto check_l2normalization_params = [interpreter, &node](void* data) {
+    auto builtin = reinterpret_cast<TfLiteL2NormParams*>(data);
+    if (builtin->activation != kTfLiteActNone) {
+      FATAL("NNAPI does not support L2Normalization with fused activations");
+    }
+    if ((node.inputs->size > 0) &&
+        (interpreter->tensor(node.inputs->data[0])->dims->size != 4)) {
+      FATAL("NNAPI only supports input rank 4 for L2Normalization");
+    }
+  };
+
+  auto add_transpose_conv_params = [&add_scalar_int32](void* data) {
+    auto builtin = reinterpret_cast<TfLiteTransposeConvParams*>(data);
+    add_scalar_int32(builtin->padding);
+    add_scalar_int32(builtin->stride_width);
+    add_scalar_int32(builtin->stride_height);
+  };
+
+  auto add_lrn_params = [&add_scalar_int32,
+                         &add_scalar_float32](void* data) {
+    auto builtin = reinterpret_cast<TfLiteLocalResponseNormParams*>(data);
+    add_scalar_int32(builtin->radius);
+    add_scalar_float32(builtin->bias);
+    add_scalar_float32(builtin->alpha);
+    add_scalar_float32(builtin->beta);
+  };
+
+  auto add_strided_slice_params = [&add_scalar_int32](void* data) {
+    auto builtin = reinterpret_cast<TfLiteStridedSliceParams*>(data);
+    add_scalar_int32(builtin->begin_mask);
+    add_scalar_int32(builtin->end_mask);
+    // ellipsis_mask and new_axis_mask are not supported on nn runtime
+    // cf) tflite interpreter supports both operations
+    if (builtin->ellipsis_mask) {
+      FATAL("STRIDED_SLICE does not support ellipsis_mask in NNAPI");
+    }
+    if (builtin->new_axis_mask) {
+      FATAL("STRIDED_SLICE does not support new_axis_mask in NNAPI");
+    }
+    add_scalar_int32(builtin->shrink_axis_mask);
+  };
+
+  auto add_gather_ex_params = [&add_scalar_int32](void* data) {
+    auto builtin = reinterpret_cast<TfLiteGatherParams*>(data);
+    add_scalar_int32(builtin->axis);
+    if (builtin->axis != 0) {
+      FATAL("GATHER does not support axis>0 in NNAPI");
+    }
+  };
+
+#ifndef OBS_BUILD
+  auto 
add_pack_ex_params = [&add_scalar_int32](void* data) { + auto builtin = reinterpret_cast<TfLitePackParams*>(data); + add_scalar_int32(builtin->values_count); + add_scalar_int32(builtin->axis); + }; + + auto add_unpack_ex_params = [&add_scalar_int32](void* data) { + auto builtin = reinterpret_cast<TfLiteUnpackParams*>(data); + add_scalar_int32(builtin->num); + add_scalar_int32(builtin->axis); + }; +#endif + + auto check_batch_to_space_params = [interpreter, &node, &augmented_inputs]() { + + //If there are 3 inputs, check if crops is having default values {0, 0, 0, 0} + //Else unsupported by NNAPI + + if(augmented_inputs.size() == 3) + { + const uint32_t crops_buffer_index = node.inputs->data[2]; + const TfLiteTensor* crops = interpreter->tensor(crops_buffer_index); + const int *crops_value = crops->data.i32; + + //Check if crops is having default values {0, 0, 0, 0} + if(crops_value[0] != 0 || crops_value[1] != 0 || crops_value[2] != 0 || crops_value[3] != 0) + { + FATAL("BATCH_TO_SPACE_ND does not support Explicit crops in NNAPI"); + } + else + { + //Restrict crops input and pass only other two inputs + augmented_inputs.pop_back(); + } + } + }; diff --git a/libs/support/tflite/src/interp/FlatBufferBuilder.cpp b/libs/tflite/src/interp/FlatBufferBuilder.cpp index 67df13f34..4b9cde719 100644 --- a/libs/support/tflite/src/interp/FlatBufferBuilder.cpp +++ b/libs/tflite/src/interp/FlatBufferBuilder.cpp @@ -14,24 +14,20 @@ * limitations under the License. */ -#include "support/tflite/interp/FlatBufferBuilder.h" +#include "tflite/interp/FlatBufferBuilder.h" -#include "support/tflite/kernels/register.h" +#include "tflite/ext/kernels/register.h" namespace nnfw { -namespace support -{ namespace tflite { -namespace interp -{ std::unique_ptr<::tflite::Interpreter> FlatBufferBuilder::build(void) const { std::unique_ptr<::tflite::Interpreter> interpreter; - ::tflite::ops::builtin::BuiltinOpResolver resolver; + nnfw::tflite::BuiltinOpResolver resolver; ::tflite::InterpreterBuilder builder(_model, resolver); @@ -40,7 +36,5 @@ std::unique_ptr<::tflite::Interpreter> FlatBufferBuilder::build(void) const return std::move(interpreter); } -} // namespace interp } // namespace tflite -} // namespace support } // namespace nnfw diff --git a/libs/support/tflite/src/interp/FunctionBuilder.cpp b/libs/tflite/src/interp/FunctionBuilder.cpp index 65783bd37..eab940c18 100644 --- a/libs/support/tflite/src/interp/FunctionBuilder.cpp +++ b/libs/tflite/src/interp/FunctionBuilder.cpp @@ -14,16 +14,12 @@ * limitations under the License. 
*/ -#include "support/tflite/interp/FunctionBuilder.h" +#include "tflite/interp/FunctionBuilder.h" namespace nnfw { -namespace support -{ namespace tflite { -namespace interp -{ std::unique_ptr<::tflite::Interpreter> FunctionBuilder::build(void) const { @@ -34,7 +30,5 @@ std::unique_ptr<::tflite::Interpreter> FunctionBuilder::build(void) const return std::move(res); } -} // namespace interp } // namespace tflite -} // namespace support } // namespace nnfw diff --git a/libs/util/CMakeLists.txt b/libs/util/CMakeLists.txt deleted file mode 100644 index eaa7ae8cf..000000000 --- a/libs/util/CMakeLists.txt +++ /dev/null @@ -1,23 +0,0 @@ -# Library `nnfw_util` -set(NNFW_UTILITY_SRCS src/environment.cpp) -list(APPEND NNFW_UTILITY_SRCS src/tensor/Shape.cpp) -list(APPEND NNFW_UTILITY_SRCS src/tensor/NonIncreasingStride.cpp) -list(APPEND NNFW_UTILITY_SRCS src/tensor/IndexFormatter.cpp) -list(APPEND NNFW_UTILITY_SRCS src/tensor/Comparator.cpp) -if(BUILD_TFLITE_BENCHMARK_MODEL) - list(APPEND NNFW_UTILITY_SRCS src/profiling/time.cc) -endif() - -add_library(nnfw_util SHARED ${NNFW_UTILITY_SRCS}) -target_include_directories(nnfw_util PUBLIC ${NNFW_INCLUDE_DIR}) - -add_library(static_nnfw_util STATIC ${NNFW_UTILITY_SRCS}) -target_include_directories(static_nnfw_util PUBLIC ${NNFW_INCLUDE_DIR}) -set_target_properties(static_nnfw_util PROPERTIES POSITION_INDEPENDENT_CODE ON) - -install(TARGETS nnfw_util - RUNTIME DESTINATION bin COMPONENT libraries - LIBRARY DESTINATION lib COMPONENT libraries) - -add_executable(nnfw_util_tensor_index_iterator "examples/tensor_index_iterator.cpp") -target_link_libraries(nnfw_util_tensor_index_iterator nnfw_util) diff --git a/libs/util/src/profiling/time.cc b/libs/util/src/profiling/time.cc deleted file mode 100644 index 6fe1b54dc..000000000 --- a/libs/util/src/profiling/time.cc +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#include "util/profiling/time.h" - -#include <sys/time.h> - -namespace tflite -{ -namespace profiling -{ -namespace time -{ -uint64_t NowMicros() -{ - struct timeval tv; - gettimeofday(&tv, nullptr); - return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec; -} -} // namespace time -} // namespace profiling -} // namespace tflite |
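For orientation, the sketch below shows how the delegate added by this patch is typically driven from client code. Only the class name nnfw::tflite::NNAPIDelegate and its IsSupported()/BuildGraph()/Invoke() methods are taken from the patch itself; the include paths, the helper function name and the way the interpreter is obtained are assumptions for illustration, not part of the change. Note that Invoke() builds and compiles the ANeuralNetworksModel lazily on its first call, so an explicit BuildGraph() call is not required.

// Minimal usage sketch (not part of the patch). Include paths are assumed;
// only the NNAPIDelegate API itself comes from the diff above.
#include "tflite/ext/nnapi_delegate.h"          // assumed header for nnfw::tflite::NNAPIDelegate
#include "tensorflow/contrib/lite/interpreter.h" // assumed TFLite interpreter header

// Runs one inference of an already-built TfLite interpreter through NNAPI.
// The delegate is passed in by reference so that the compiled NNAPI model
// created on the first Invoke() can be reused across calls.
TfLiteStatus RunThroughNnapi(nnfw::tflite::NNAPIDelegate& delegate,
                             ::tflite::Interpreter* interpreter)
{
  // Bail out when libneuralnetworks is not available on the device.
  if (!delegate.IsSupported())
    return kTfLiteError;

  // Invoke() builds and compiles the ANeuralNetworksModel on the first call
  // (via BuildGraph), copies the input buffers (including the bool <-> uint8
  // conversion shown in the patch), runs a blocking execution and writes the
  // results back into the interpreter's output tensors.
  return delegate.Invoke(interpreter);
}

The caller is still expected to allocate the interpreter's tensors and fill its input buffers beforehand, exactly as for a CPU-only TFLite invocation; the delegate only replaces the execution step.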