diff options
Diffstat (limited to 'compute/ARMComputeEx')
186 files changed, 23807 insertions, 0 deletions
diff --git a/compute/ARMComputeEx/CMakeLists.txt b/compute/ARMComputeEx/CMakeLists.txt new file mode 100644 index 000000000..aaebff758 --- /dev/null +++ b/compute/ARMComputeEx/CMakeLists.txt @@ -0,0 +1,32 @@ +nnas_find_package(ARMCompute QUIET) + +if(NOT ARMCompute_FOUND) + message(STATUS "Check ARM Compute library extension build: need ARM Compute library") + return() +else(NOT ARMCompute_FOUND) + message(STATUS "Check ARM Compute library extension build: OK") +endif(NOT ARMCompute_FOUND) + +set(ACL_EX_BASE ${CMAKE_CURRENT_SOURCE_DIR}) + +file(GLOB_RECURSE ACL_EX_SRCS "${ACL_EX_BASE}/*.cpp") + +# generate embeded cl_kernel +execute_process ( + WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" + COMMAND bash -c "python resolve_includes.py" +) + +add_library(arm_compute_ex SHARED ${ACL_EX_SRCS}) +target_include_directories(arm_compute_ex PUBLIC ${ACL_EX_BASE}) +target_link_libraries(arm_compute_ex PRIVATE arm_compute) +target_link_libraries(arm_compute_ex PRIVATE nnfw_common) +target_link_libraries(arm_compute_ex PRIVATE nnfw_coverage) +# Defines to enable validate check in debug build +target_compile_definitions(arm_compute_ex PRIVATE EMBEDDED_KERNELS + $<$<CONFIG:Debug>:ARM_COMPUTE_DEBUG_ENABLED ARM_COMPUTE_ASSERTS_ENABLED + ARM_COMPUTE_LOGGING_ENABLED>) +# Validate check functions are not used on release build +# Some parameter are used for validate check function call, and these parameter may not used on release build +target_compile_options(arm_compute_ex PRIVATE $<$<NOT:$<CONFIG:Debug>>:-Wno-unused-parameter -Wno-unused-function>) +install(TARGETS arm_compute_ex DESTINATION lib) diff --git a/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h b/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h new file mode 100644 index 000000000..e4e752ef9 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h @@ -0,0 +1,245 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CLKernelLibraryEx.h + * @ingroup COM_AI_RUNTIME + * @brief This file is a cloned version of CLKernelLibrary.h in ACL. This file defines + * an interface for CLKernelLibrary.cpp which adds more OpenCL kernels on top of ACL. + */ + +#ifndef __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ +#define __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ + +#include "arm_compute/core/CL/OpenCL.h" + +#include <map> +#include <set> +#include <string> +#include <utility> + +namespace arm_compute +{ + +/** + * @brief Class to build OpenCL kernels added from nnfw + * */ +class CLKernelLibraryEx +{ + using StringSet = std::set<std::string>; + +private: + /** + * @brief Construct a new CLKernelLibraryEx object + */ + CLKernelLibraryEx(); + +public: + /** + * @brief Prevent instances of this class from being copied. + */ + CLKernelLibraryEx(const CLKernelLibraryEx &) = delete; + + /** + * @brief Prevent instances of this class from being copied. + */ + const CLKernelLibraryEx &operator=(const CLKernelLibraryEx &) = delete; + + /** + * @brief Get the KernelLibrary singleton. + * @return The KernelLibrary instance + */ + static CLKernelLibraryEx &get(); + + /** + * @brief Initialise the kernel library. + * @param[in] kernel_path Path of the directory from which kernel sources are loaded. + * @param[in] context CL context used to create programs. 
+ * @param[in] device CL device for which the programs are created. + * @return N/A + */ + void init(std::string kernel_path, cl::Context context, cl::Device device) + { + _kernel_path = std::move(kernel_path); + _context = std::move(context); + _device = std::move(device); + } + + /** + * @brief Set the path that the kernels reside in. + * @param[in] kernel_path Path of the directory from which kernel sources are loaded. + * @return N/A + */ + void set_kernel_path(const std::string &kernel_path) { _kernel_path = kernel_path; }; + + /** + * @brief Get the path that the kernels reside in. + * @return the path of kernel files + */ + std::string get_kernel_path() { return _kernel_path; }; + + /** + * @brief Get the source of the selected program. + * @param[in] program_name Program name. + * @return Source of the selected program. + */ + std::string get_program_source(const std::string &program_name); + + /** + * @brief Set the CL context used to create programs. + * @note Setting the context also resets the device to the + * first one available in the new context. + * @param[in] context A CL context. + * @return N/A + */ + void set_context(cl::Context context) + { + _context = std::move(context); + if (_context.get() == nullptr) + { + _device = cl::Device(); + } + else + { + const auto cl_devices = _context.getInfo<CL_CONTEXT_DEVICES>(); + + if (cl_devices.empty()) + { + _device = cl::Device(); + } + else + { + _device = cl_devices[0]; + } + } + } + + /** + * @brief Return associated CL context. + * @return A CL context. + */ + cl::Context &context() { return _context; } + + /** + * @brief Set the CL device for which the programs are created. + * @param[in] device A CL device. + * @return N/A + */ + void set_device(cl::Device device) { _device = std::move(device); } + + /** + * @brief Gets the CL device for which the programs are created. + * @return A CL device. 
+ */ + cl::Device &get_device() { return _device; } + + /** + * @brief Return the device version + * @return The content of CL_DEVICE_VERSION + */ + std::string get_device_version(); + + /** + * @brief Create a kernel from the kernel library. + * @param[in] kernel_name Kernel name. + * @param[in] build_options_set Kernel build options as a set. + * @return The created kernel. + */ + Kernel create_kernel(const std::string &kernel_name, + const StringSet &build_options_set = {}) const; + + /** + * @brief Find the maximum number of local work items in a workgroup can be supported for the + * kernel. + * @param[in] kernel kernel object + */ + + size_t max_local_workgroup_size(const cl::Kernel &kernel) const; + /** + * @brief Return the default NDRange for the device. + * @return default NDRangeof the device + */ + cl::NDRange default_ndrange() const; + + /** + * @brief Clear the library's cache of binary programs + * @return N/A + */ + void clear_programs_cache() + { + _programs_map.clear(); + _built_programs_map.clear(); + } + + /** + * @brief Access the cache of built OpenCL programs + * @return program map data structure of which key is name of kernel and value is + * kerel source name. 
(*.cl) + */ + const std::map<std::string, cl::Program> &get_built_programs() const + { + return _built_programs_map; + } + + /** + * @brief Add a new built program to the cache + * @param[in] built_program_name Name of the program + * @param[in] program Built program to add to the cache + * @return N/A + */ + void add_built_program(const std::string &built_program_name, cl::Program program); + + /** + * @brief Returns true if FP16 is supported by the CL device + * @return true if the CL device supports FP16 + */ + bool fp16_supported() const; + + /** + * @brief Returns true if int64_base_atomics extension is supported by the CL device + * @return true if the CL device supports int64_base_atomics extension + */ + bool int64_base_atomics_supported() const; + +private: + /** + * @brief Load program and its dependencies. + * @param[in] program_name Name of the program to load. + */ + const Program &load_program(const std::string &program_name) const; + /** + * @brief Concatenates contents of a set into a single string. + * @param[in] s Input set to concatenate. + * @return Concatenated string. + */ + std::string stringify_set(const StringSet &s) const; + + cl::Context _context; /**< Underlying CL context. */ + cl::Device _device; /**< Underlying CL device. */ + std::string _kernel_path; /**< Path to the kernels folder. */ + mutable std::map<std::string, const Program> + _programs_map; /**< Map with all already loaded program data. */ + mutable std::map<std::string, cl::Program> + _built_programs_map; /**< Map with all already built program data. */ + static const std::map<std::string, std::string> + _kernel_program_map; /**< Map that associates kernel names with programs. */ + static const std::map<std::string, std::string> + _program_source_map; /**< Contains sources for all programs. + Used for compile-time kernel inclusion. 
>*/ +}; +} +#endif /* __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h new file mode 100644 index 000000000..b98b174f7 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CLArgOperationKernel.h + * @brief This file defines CLArgOperationKernel + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __ARM_COMPUTE_CLARGOPERATIONKERNEL_H__ +#define __ARM_COMPUTE_CLARGOPERATIONKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to define interface for the argop kernel. + */ +class CLArgOperationKernel : public ICLKernel +{ +public: + /** + * @brief Default constructor. + */ + CLArgOperationKernel(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLArgOperationKernel to be copied + */ + CLArgOperationKernel(const CLArgOperationKernel &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLArgOperationKernel to be copied + * @return Reference of this instance + */ + CLArgOperationKernel &operator=(const CLArgOperationKernel &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLArgOperationKernel to be moved + */ + CLArgOperationKernel(CLArgOperationKernel &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLArgOperationKernel to be moved + * @return Reference of this instance + */ + CLArgOperationKernel &operator=(CLArgOperationKernel &&) = default; + /** + * @brief Initialise the kernel's input, output and border mode. + * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32. + * @param[out] output The output tensor, Data types supported: S32. + * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates. + * @param[in] op Arg operation to perform. + * return N/A + */ + void configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis, ArgOperation op); + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref + * CLArgOperationKernel + * @param[in] input An input tensor info. Data types supported: U8/QASYMM8/S32/F32. + * @param[in] output The output tensor info, Data types supported: S32. + * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates. + * @param[in] op Arg operation to perform. 
+ * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis, + ArgOperation op); + + /* + * @brief Run CLArgOperationKernel op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + ICLTensor *_output; + uint32_t _axis; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLARGOPERATIONKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h new file mode 100644 index 000000000..ab33d9d3a --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__ +#define __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to return truth values of two input tensors for Binary Logical Op*/ +class CLBinaryLogicalOpKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLBinaryLogicalOpKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLBinaryLogicalOpKernel(const CLBinaryLogicalOpKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLBinaryLogicalOpKernel &operator=(const CLBinaryLogicalOpKernel &) = delete; + /** Allow instances of this class to be moved */ + CLBinaryLogicalOpKernel(CLBinaryLogicalOpKernel &&) = default; + /** Allow instances of this class to be moved */ + CLBinaryLogicalOpKernel &operator=(CLBinaryLogicalOpKernel &&) = default; + /** Initialize the kernel's input, output. + * + * @param[in] input1 Source tensor1. + * @param[in] input2 Source tensor2. + * @param[out] output Output tensor. + */ + void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, + BinaryLogicalOperation op); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + + BorderSize border_size() const override; + +private: + const ICLTensor *_input1; + const ICLTensor *_input2; + ICLTensor *_output; +}; + +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h new file mode 100644 index 000000000..16cef0b61 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CLCastKernel.h + * @ingroup COM_AI_RUNTIME + * @brief This file defines CLCastKernel class + */ + +#ifndef __ARM_COMPUTE_CLCASTKERNEL_H__ +#define __ARM_COMPUTE_CLCASTKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to define OpenCL kernel for cast operation + */ +class CLCastKernel : public ICLKernel +{ +public: + /** + * @brief Construct CLCastKernel object + */ + CLCastKernel(); + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + */ + CLCastKernel(const CLCastKernel &) = delete; + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + */ + CLCastKernel &operator=(const CLCastKernel &) = delete; + + /** + * @brief Construct CLCastKernel object using default move constructor + * @param[in] CLCastKernel object to move + */ + CLCastKernel(CLCastKernel &&) = default; + + /** + * @brief Allow instances of this class to be moved + * @param[in] CLCastKernel object to move + */ + CLCastKernel &operator=(CLCastKernel &&) = default; + + /** + * @brief Destruct this CLCastKernel object + */ + ~CLCastKernel() = default; + + /** + * @brief Initialise the kernel's input and output. + * @param[in] input Input tensor. 
Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[in] input_subtype Sub data type of input. + * @return N/A + */ + void configure(const ICLTensor *input, ICLTensor *output, SubDataType input_subtype); + + /** + * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command + * queue. + * @note The queue is *not* flushed by this method, and therefore the kernel will not have + * been executed by the time this method returns. + * @param[in] window Region on which to execute the kernel. (Must be a valid region of + * the window returned by window()). + * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; /**< Source tensor */ + ICLTensor *_output; /**< Destination tensor */ +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLCASTKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h new file mode 100644 index 000000000..60ec7a82a --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__ +#define __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to perform depthTospace operation */ +class CLDepthToSpaceKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLDepthToSpaceKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLDepthToSpaceKernel(const CLDepthToSpaceKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLDepthToSpaceKernel &operator=(const CLDepthToSpaceKernel &) = delete; + /** Allow instances of this class to be moved */ + CLDepthToSpaceKernel(CLDepthToSpaceKernel &&) = default; + /** Allow instances of this class to be moved */ + CLDepthToSpaceKernel &operator=(CLDepthToSpaceKernel &&) = default; + /** Default destructor */ + ~CLDepthToSpaceKernel() = default; + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. 
+ */ + void configure(const ICLTensor *input, ICLTensor *output, const int32_t block_size); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; /**< Source tensor */ + ICLTensor *_output; /**< Destination tensor */ +}; + +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h new file mode 100644 index 000000000..da075db69 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file CLEmbeddingLookupKernel.h + * @ingroup COM_AI_RUNTIME + * @brief This file defines CLEmbeddingLookupKernel class + */ + +#ifndef __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__ +#define __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** +* @brief Class to perform EmbeddingLookup operation with opencl kernel +*/ +class CLEmbeddingLookupKernel : public ICLKernel +{ +public: + /** + * @brief Construct a CLEmbeddingLookupKernel object + * */ + CLEmbeddingLookupKernel(); + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + * */ + CLEmbeddingLookupKernel(const CLEmbeddingLookupKernel &) = delete; + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + * */ + CLEmbeddingLookupKernel &operator=(const CLEmbeddingLookupKernel &) = delete; + + /** + * @brief Construct a CLEmbeddingLookupKernel object by using default move constructor + * @param[in] CLEmbeddingLookupKernel object to move + * */ + CLEmbeddingLookupKernel(CLEmbeddingLookupKernel &&) = default; + + /** + * @brief Move assignment operator + * @param[in] CLEmbeddingLookupKernel object to move + * */ + CLEmbeddingLookupKernel &operator=(CLEmbeddingLookupKernel &&) = default; + + /** + * @brief Destruct this object + * */ + ~CLEmbeddingLookupKernel() = default; + + /** + * @brief Set the input and output of the kernel + * @param[in] input Source tensor. + * Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Destination tensor. Data type supported: Same as @p input + * @param[in] lookups Lookups are 1D tensor that values are indices into the first + * dimension of input. + * Data types supported: S32. 
+ * @return N/A + */ + void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups); + + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref + * CLEmbeddingLookupKernel + * @param[in] input The input tensor info. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[in] output The output tensor info, Data types supported: same as @p input1. + * @param[in] lookups Lookups info. Data types supported: S32. + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *lookups); + + /** + * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command + * queue. + * @note The queue is *not* flushed by this method, and therefore the kernel will not have + * been executed by the time this method returns. + * @param[in] window Region on which to execute the kernel. (Must be a valid region of + * the window returned by window()). + * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; /** Source tensor */ + ICLTensor *_output; /** Destination tensor */ + const ICLTensor *_lookups; /** Lookups tensor */ +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h new file mode 100644 index 000000000..aa81a1efa --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CLGatherExKernel.h + * @ingroup COM_AI_RUNTIME + * @brief This file defines CLGatherExKernel class + */ + +#ifndef __ARM_COMPUTE_CLGATHEREXKERNEL_H__ +#define __ARM_COMPUTE_CLGATHEREXKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to define an interface for the gather kernel. + */ +class CLGatherExKernel : public ICLKernel +{ +public: + /** + * @brief Construct CLGatherExKernel object + * */ + CLGatherExKernel(); + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + */ + CLGatherExKernel(const CLGatherExKernel &) = delete; + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + */ + CLGatherExKernel &operator=(const CLGatherExKernel &) = delete; + + /** + * @brief Construct CLGatherExKernel object by using default move constructor + * @param[in] CLGatherExKernel object to move + */ + CLGatherExKernel(CLGatherExKernel &&) = default; + + /** + * @brief Move assignment operator + * @param[in] CLGatherExKernel object to move + */ + CLGatherExKernel &operator=(CLGatherExKernel &&) = default; + + /** + * @brief Initialise the kernel's input, output and border mode. + * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32. + * @param[in] indices Indices tensor. Data types supported: S32. + * @param[out] output The output tensor, Data types supported: same as @p input1. 
+ * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative + * values wrap around. Defaults to 0 + * @return N/A + */ + void configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0); + + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref + * CLGatherExKernel + * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32. + * @param[in] indices Indices tensor. Data types supported: S32. + * @param[out] output The output tensor, Data types supported: same as @p input1. + * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative + * values wrap around. Defaults to 0 + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis = 0); + + /** + * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command + * queue. + * @note The queue is *not* flushed by this method, and therefore the kernel will not have + * been executed by the time this method returns. + * @param[in] window Region on which to execute the kernel. (Must be a valid region of + * the window returned by window()). + * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + const ICLTensor *_indices; + ICLTensor *_output; + int _axis; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLGATHEREXKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h new file mode 100644 index 000000000..8269e5a7a --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLHashtableLookupKernel.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file defines CLHashtableLookupKernel class
+ */
+
+#ifndef __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__
+#define __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+* @brief Class to perform HashtableLookup operation with opencl kernel
+*/
+class CLHashtableLookupKernel : public ICLKernel
+{
+public:
+  /**
+   * @brief Construct a CLHashtableLookupKernel object
+   * */
+  CLHashtableLookupKernel();
+
+  /**
+   * @brief Prevent instances of this class from being copied (As this class contains pointers)
+   * */
+  CLHashtableLookupKernel(const CLHashtableLookupKernel &) = delete;
+
+  /**
+   * @brief Prevent instances of this class from being copied (As this class contains pointers)
+   * */
+  CLHashtableLookupKernel &operator=(const CLHashtableLookupKernel &) = delete;
+
+  /**
+   * @brief Construct a CLHashtableLookupKernel object by using default move constructor
+   * @param[in] CLHashtableLookupKernel object to move
+   * */
+  CLHashtableLookupKernel(CLHashtableLookupKernel &&) = default;
+
+  /**
+   * @brief Move assignment operator
+   * @param[in] CLHashtableLookupKernel object to move
+   * */
+  CLHashtableLookupKernel &operator=(CLHashtableLookupKernel &&) = default;
+
+  /**
+   * @brief Destruct this object
+   * */
+  ~CLHashtableLookupKernel() = default;
+
+  /**
+   * @brief Set the input and output of the kernel
+   * @param[in]  lookups Lookups 1D tensor whose values are indices into the first dimension of
+   *                     @p input.
+   * @param[in]  keys    Keys 1D tensor. keys and input pair represent a map.
+   *                     Data types supported: S32
+   * @param[in]  input   Source tensor.
+   *                     Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+   * @param[out] output  Destination tensor. Data types and data layouts supported: Same as @p
+   *                     input.
+   * @param[out] hits    Hits 1D tensor. A boolean tensor that indicates whether the lookup hits
+   *                     (True) or not (False). Data types supported: U8/QASYMM8
+   * @return N/A
+   */
+  void configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *input,
+                 ICLTensor *output, ICLTensor *hits);
+
+  /**
+   * @brief Static function to check if given info will lead to a valid configuration of @ref
+   *        CLHashtableLookupKernel
+   * @note All parameters are inputs here: validate() only inspects the tensor infos, it does
+   *       not modify them (every argument is a pointer-to-const).
+   * @param[in] lookups The lookups tensor info. Data types supported: S32.
+   * @param[in] keys    The keys tensor info. keys and input pair represent a map.
+   *                    Data types supported: S32
+   * @param[in] input   The input tensor info.
+   *                    Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+   * @param[in] output  The output tensor info. Data types and data layouts supported: Same as @p
+   *                    input.
+   * @param[in] hits    The hits tensor info. A boolean tensor that indicates whether the lookup
+   *                    hits (True) or not (False). Data types supported: U8/QASYMM8
+   * @return a status
+   */
+  static Status validate(const ITensorInfo *lookups, const ITensorInfo *keys,
+                         const ITensorInfo *input, const ITensorInfo *output,
+                         const ITensorInfo *hits);
+
+  /**
+   * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command
+   *        queue.
+   * @note The queue is *not* flushed by this method, and therefore the kernel will not have
+   *       been executed by the time this method returns.
+   * @param[in]     window Region on which to execute the kernel. (Must be a valid region of
+   *                       the window returned by window()).
+   * @param[in,out] queue  Command queue on which to enqueue the kernel.
+   * @return N/A
+   */
+  void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+  const ICLTensor *_lookups{nullptr};                 /**< Lookups tensor */
+  const ICLTensor *_keys{nullptr};                    /**< Keys tensor */
+  const ICLTensor *_input{nullptr};                   /**< Source tensor */
+  ICLTensor *_output{nullptr};                        /**< Destination tensor */
+  ICLTensor *_hits{nullptr};                          /**< Hits tensor */
+  std::unique_ptr<CLTensor> _lookup_indices{nullptr}; /**< Lookup indices tensor */
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h
new file mode 100644
index 000000000..f5e147e03
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__
+#define __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for performing an instance normalization */
+class CLInstanceNormalizationLayerKernelEx : public ICLKernel
+{
+public:
+  /** Constructor */
+  CLInstanceNormalizationLayerKernelEx();
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  CLInstanceNormalizationLayerKernelEx(const CLInstanceNormalizationLayerKernelEx &) = delete;
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  CLInstanceNormalizationLayerKernelEx &
+  operator=(const CLInstanceNormalizationLayerKernelEx &) = delete;
+  /** Default Move Constructor. */
+  CLInstanceNormalizationLayerKernelEx(CLInstanceNormalizationLayerKernelEx &&) = default;
+  /** Default move assignment operator */
+  CLInstanceNormalizationLayerKernelEx &
+  operator=(CLInstanceNormalizationLayerKernelEx &&) = default;
+  /** Default destructor */
+  ~CLInstanceNormalizationLayerKernelEx() = default;
+
+  /** Set the input and output tensors.
+   *
+   * @note NOTE(review): this doc lists NCHW only while validate() below lists "NHWC, NCHW" --
+   *       confirm the actually supported layouts against the kernel implementation.
+   *
+   * @param[in, out] input   Source tensor. Data types supported: F16/F32. Data layout supported:
+   *                         NCHW
+   * @param[out]     output  Destination tensor. Data types and data layouts supported: same as @p
+   *                         input.
+   * @param[in]      gamma   (Optional) The scale tensor applied to the normalized tensor. Defaults
+   *                         to nullptr
+   * @param[in]      beta    (Optional) The offset tensor applied to the normalized tensor. Defaults
+   *                         to nullptr
+   * @param[in]      epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
+   */
+  void configure(ICLTensor *input, ICLTensor *output, ICLTensor *gamma = nullptr,
+                 ICLTensor *beta = nullptr, float epsilon = 1e-12f);
+
+  /** Static function to check if given info will lead to a valid configuration of @ref
+   *  CLInstanceNormalizationLayerEx.
+   *
+   * @param[in] input   Source tensor info. In case of @p output tensor = nullptr this tensor will
+   *                    store the result of the normalization.
+   *                    Data types supported: F16/F32. Data layout supported: NHWC, NCHW
+   * @param[in] output  Destination tensor info. Data types and data layouts supported: same as @p
+   *                    input.
+   * @param[in] gamma   (Optional) The scale tensor applied to the normalized tensor. Defaults to
+   *                    nullptr
+   * @param[in] beta    (Optional) The offset tensor applied to the normalized tensor. Defaults to
+   *                    nullptr
+   * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
+   *
+   * @return a status
+   */
+  static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+                         const ITensorInfo *gamma = nullptr, const ITensorInfo *beta = nullptr,
+                         float epsilon = 1e-12f);
+
+  // Inherited methods overridden:
+  void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+  ICLTensor *_input;
+  ICLTensor *_output;
+  ICLTensor *_gamma;
+  ICLTensor *_beta;
+  float _epsilon;
+  bool _run_in_place; /**< True when the result is written back into @p input (output omitted) --
+                           presumably set in configure(); confirm in the kernel source */
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h
new file mode 100644
index 000000000..ccbea147e
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLNEGKERNEL_H__
+#define __ARM_COMPUTE_CLNEGKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform a negation operation on tensor*/
+class CLNegKernel : public ICLKernel
+{
+public:
+  /** Default constructor */
+  CLNegKernel();
+  /** Prevent instances of this class from being copied (As this class contains pointers). */
+  CLNegKernel(const CLNegKernel &) = delete;
+  /** Prevent instances of this class from being copied (As this class contains pointers). */
+  CLNegKernel &operator=(const CLNegKernel &) = delete;
+  /** Allow instances of this class to be moved */
+  CLNegKernel(CLNegKernel &&) = default;
+  /** Allow instances of this class to be moved */
+  CLNegKernel &operator=(CLNegKernel &&) = default;
+  /** Initialize the kernel's input, output.
+   *
+   * @param[in]  input  Source tensor (supported data types are not stated here -- see the kernel
+   *                    source for the accepted types).
+   * @param[out] output Destination tensor.
+   */
+  void configure(const ICLTensor *input, ICLTensor *output);
+
+  // Inherited methods overridden:
+  void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+  const ICLTensor *_input;
+  ICLTensor *_output;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLNEGKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h
new file mode 100644
index 000000000..eff1b8bd5
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLPRELU_KERNEL_H__
+#define __ARM_COMPUTE_CLPRELU_KERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to calculate PReLU*/
+class CLPReLUKernel : public ICLKernel
+{
+public:
+  /** Default constructor */
+  CLPReLUKernel();
+  /** Prevent instances of this class from being copied (As this class contains pointers). */
+  CLPReLUKernel(const CLPReLUKernel &) = delete;
+  /** Prevent instances of this class from being copied (As this class contains pointers). */
+  CLPReLUKernel &operator=(const CLPReLUKernel &) = delete;
+  /** Allow instances of this class to be moved */
+  CLPReLUKernel(CLPReLUKernel &&) = default;
+  /** Allow instances of this class to be moved */
+  CLPReLUKernel &operator=(CLPReLUKernel &&) = default;
+  /** Initialize the kernel's input, output.
+   *
+   * @param[in]  input  Source tensor1: the input feature map.
+   * @param[in]  alpha  Source tensor2: the alpha (negative-slope) values, as named by the PReLU
+   *                    definition.
+   * @param[out] output Output tensor.
+   */
+  void configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output);
+
+  // Inherited methods overridden:
+  void run(const Window &window, cl::CommandQueue &queue) override;
+
+  /** Border size used by this kernel. A non-zero border suggests the kernel accesses elements
+   *  outside the valid region (e.g. for broadcasting @p alpha) -- confirm in the kernel source. */
+  BorderSize border_size() const override;
+
+private:
+  const ICLTensor *_input;
+  const ICLTensor *_alpha;
+  ICLTensor *_output;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLPRELU_KERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h
new file mode 100644
index 000000000..a26a4a7fc
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLReduceOperationKernel.h
+ * @brief This file defines CLReduceOperationKernel class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__
+#define __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to define interface for the reduce operation kernel
+ */
+class CLReduceOperationKernel : public ICLKernel
+{
+public:
+  /**
+   * @brief Default constructor
+   */
+  CLReduceOperationKernel();
+  /**
+   * @brief Prevent instances of this class from being copied (As this class contains pointers)
+   */
+  CLReduceOperationKernel(const CLReduceOperationKernel &) = delete;
+  /**
+   * @brief Prevent instances of this class from being copied (As this class contains pointers)
+   */
+  CLReduceOperationKernel &operator=(const CLReduceOperationKernel &) = delete;
+  /**
+   * @brief Allow instances of this class to be moved
+   */
+  CLReduceOperationKernel(CLReduceOperationKernel &&) = default;
+  /**
+   * @brief Allow instances of this class to be moved
+   */
+  CLReduceOperationKernel &operator=(CLReduceOperationKernel &&) = default;
+  /**
+   * @brief Default destructor
+   */
+  ~CLReduceOperationKernel() = default;
+
+  /**
+   * @brief Set the input and output tensors.
+   * @param[in]  input  Source tensor. Data types supported: U8/S32/F32.
+   * @param[out] output Destination tensor. Data types supported: Same as @p input.
+   *                    Output will have the same number of dimensions as input.
+   * @param[in]  axis   Axis along which to reduce.
+   * @param[in]  op     Reduce operation to perform.
+   * @return N/A
+   */
+  void configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis,
+                 ReduceOperation op);
+
+  /**
+   * @brief Static function to check if given info will lead to a valid configuration of @ref
+   *        CLReduceOperationKernel.
+   * @param[in] input  Source tensor info. Data types supported: U8/S32/F32.
+   * @param[in] output Destination tensor info. Data types supported: Same as @p input.
+   *                   Output will have the same number of dimensions as input.
+   * @param[in] axis   Axis along which to reduce.
+   * @param[in] op     Reduce operation to perform.
+   * @return a status
+   */
+  static Status validate(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis,
+                         ReduceOperation op);
+
+  /**
+   * @brief Run CLReduceOperationKernel op
+   * @param[in] window Window to be used for in_slice
+   * @param[in] queue  CLQueue
+   * @return N/A
+   */
+  void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+  const ICLTensor *_input;
+  ICLTensor *_output;
+  uint32_t _axis;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h
new file mode 100644
index 000000000..577e38cc4
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_KERNEL_H__
+#define __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_KERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform SPACE_TO_BATCH_ND operation (class is final: not intended to be
+ *  derived from) */
+class CLSpaceToBatchNDKernel final : public ICLKernel
+{
+public:
+  /** Default constructor */
+  CLSpaceToBatchNDKernel();
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  CLSpaceToBatchNDKernel(const CLSpaceToBatchNDKernel &) = delete;
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  CLSpaceToBatchNDKernel &operator=(const CLSpaceToBatchNDKernel &) = delete;
+  /** Allow instances of this class to be moved */
+  CLSpaceToBatchNDKernel(CLSpaceToBatchNDKernel &&) = default;
+  /** Allow instances of this class to be moved */
+  CLSpaceToBatchNDKernel &operator=(CLSpaceToBatchNDKernel &&) = default;
+  /** Default destructor */
+  ~CLSpaceToBatchNDKernel() = default;
+  /** Initialise the kernel's input and output.
+   *
+   * @note The data layout of input and output must be the same.
+   * @note The number of dimensions of input and output must be 4, and `spatial` dimensions
+   *       are height and width.
+   * @param[in]  input        Input tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32.
+   *                          Data layout supported: NCHW/NHWC
+   * @param[in]  block_size   Block size tensor. Data types supported: S32.
+   * @param[in]  padding_size Padding size tensor. Data types supported: S32.
+   * @param[out] output       Output tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32.
+   *                          Data layout supported: NCHW/NHWC
+   */
+  void configure(const ICLTensor *input, const ICLTensor *block_size, const ICLTensor *padding_size,
+                 ICLTensor *output);
+
+  // Inherited methods overridden:
+  void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+  const ICLTensor *_input{nullptr};        /**< Source tensor */
+  const ICLTensor *_block_size{nullptr};   /**< Block size tensor */
+  const ICLTensor *_padding_size{nullptr}; /**< Padding size tensor */
+  ICLTensor *_output{nullptr};             /**< Destination tensor */
+};
+
+} // namespace arm_compute
+
+#endif /* __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_KERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h
new file mode 100644
index 000000000..be845a549
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__
+#define __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform spaceTodepth operation */
+class CLSpaceToDepthKernel : public ICLKernel
+{
+public:
+  /** Default constructor */
+  CLSpaceToDepthKernel();
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  CLSpaceToDepthKernel(const CLSpaceToDepthKernel &) = delete;
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  CLSpaceToDepthKernel &operator=(const CLSpaceToDepthKernel &) = delete;
+  /** Allow instances of this class to be moved */
+  CLSpaceToDepthKernel(CLSpaceToDepthKernel &&) = default;
+  /** Allow instances of this class to be moved */
+  CLSpaceToDepthKernel &operator=(CLSpaceToDepthKernel &&) = default;
+  /** Default destructor */
+  ~CLSpaceToDepthKernel() = default;
+  /** Initialise the kernel's input and output.
+   *
+   * @param[in]  input      Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+   * @param[out] output     Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+   * @param[in]  block_size Block size -- presumably each block_size x block_size block of
+   *                        spatial values is rearranged into the depth dimension (standard
+   *                        SPACE_TO_DEPTH semantics); confirm against the kernel source.
+   */
+  void configure(const ICLTensor *input, ICLTensor *output, const int32_t block_size);
+
+  // Inherited methods overridden:
+  void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+  const ICLTensor *_input; /**< Source tensor */
+  ICLTensor *_output;      /**< Destination tensor */
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h
new file mode 100644
index 000000000..8da2daecc
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h
@@ -0,0 +1,657 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd.
All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CLTopKV2Kernel.h + * @brief This file defines classes for TopKV2Kernel + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __ARM_COMPUTE_CLTOPKV2KERNEL_H__ +#define __ARM_COMPUTE_CLTOPKV2KERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +// these parameters can be changed +#define _ITEMS 16 // number of items in a group +#define _GROUPS 4 // the number of virtual processors is _ITEMS * _GROUPS +#define _HISTOSPLIT (_ITEMS * _GROUPS / 2) // number of splits of the histogram +#define PERMUT // store the final permutation +//////////////////////////////////////////////////////// + +// Disable GPU implementation +// TODO Enable GPU implementation with verification, or remove code +// Invalid result on GPU +#if 0 +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to define CLTopKV2Single + */ +class CLTopKV2Single : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLTopKV2Single(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2Single to be copied + */ + CLTopKV2Single(const CLTopKV2Single &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLTopKV2Single to be copied + * @return Reference of this instance + */ + CLTopKV2Single &operator=(const CLTopKV2Single &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Single to be moved + */ + CLTopKV2Single(CLTopKV2Single &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Single to be moved + * @return Reference of this instance + */ + CLTopKV2Single &operator=(CLTopKV2Single &&) = default; + + /** + * @brief Initialise kernel with params + * @param[in] input An input tensor + * @param[in] topk_values Values of the top k predictions + * @param[in] topk_indices Indices of the top k predictions + * @param[in] indices Indices + * @param[in] temp_stack Temp stack + * @param[in] k K of the top k predictions + * @param[in] n Number times to quick-sort + * return N/A + */ + void configure(ICLTensor *input, ICLTensor *topk_values, ICLTensor *topk_indices, + cl::Buffer *indices, cl::Buffer *temp_stack, int k, int n); + + /* + * @brief Run CLTopKV2Single op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + ICLTensor *_input; + ICLTensor *_topk_values; + ICLTensor *_topk_indices; +}; + +/** + * @brief Class to define CLTopKV2Init + */ +class CLTopKV2Init : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLTopKV2Init(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2Init to be copied + */ + CLTopKV2Init(const CLTopKV2Init &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLTopKV2Init to be copied + * @return Reference of this instance + */ + CLTopKV2Init &operator=(const CLTopKV2Init &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Init to be moved + */ + CLTopKV2Init(CLTopKV2Init &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Init to be moved + * @return Reference of this instance + */ + CLTopKV2Init &operator=(CLTopKV2Init &&) = default; + + /** + * @brief Initialise kernel with params + * @param[in] input An input tensor + * @param[in] in_key_buf Buffer of input key + * @param[in] in_ind_buf Buffer of input index + * @param[in] n Number times to quick-sort + * return N/A + */ + void configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffer *in_ind_buf, int n); + + /* + * @brief Run CLTopKV2Init op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + ICLTensor *_input; +}; + +/** + * @brief Class to define CLRadixSortHistogram + */ +class CLRadixSortHistogram : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLRadixSortHistogram(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortHistogram to be copied + */ + CLRadixSortHistogram(const CLRadixSortHistogram &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLRadixSortHistogram to be copied + * @return Reference of this instance + */ + CLRadixSortHistogram &operator=(const CLRadixSortHistogram &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortHistogram to be moved + */ + CLRadixSortHistogram(CLRadixSortHistogram &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortHistogram to be moved + * @return Reference of this instance + */ + CLRadixSortHistogram &operator=(CLRadixSortHistogram &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] hist_buf Buffer of histogram + * @param[in] bits Number of bits to be used for radix sort + * @param[in] n Integer number size to sort + * return N/A + */ + void configure(cl::Buffer *hist_buf, int bits, int n); + + /** + * @brief Set pass + * @param[in] pass Passes made of in radix sort algorithm + * @param[in] in_key_buf Buffer of input key + * return N/A + */ + void setPass(int pass, cl::Buffer *in_key_buf) + { + _pass = pass; + _in_key_buf = in_key_buf; + } + + /* + * @brief Run CLRadixSortHistogram op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + int _pass; + cl::Buffer *_in_key_buf; +}; + +/** + * @brief Class to define CLRadixSortScanHistogram + */ +class CLRadixSortScanHistogram : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLRadixSortScanHistogram(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLRadixSortScanHistogram to be copied + */ + CLRadixSortScanHistogram(const CLRadixSortScanHistogram &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortScanHistogram to be copied + * @return Reference of this instance + */ + CLRadixSortScanHistogram &operator=(const CLRadixSortScanHistogram &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortScanHistogram to be moved + */ + CLRadixSortScanHistogram(CLRadixSortScanHistogram &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortScanHistogram to be moved + * @return Reference of this instance + */ + CLRadixSortScanHistogram &operator=(CLRadixSortScanHistogram &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] hist_buf Buffer of histogram + * @param[out] glob_sum_buf Buffer of global sum + * @param[in] bits Number of bits to be used for radix sort + * return N/A + */ + void configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits); + + /* + * @brief Run CLRadixSortScanHistogram op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; +}; + +/** + * @brief Class to define CLRadixSortGlobalScanHistogram + */ +class CLRadixSortGlobalScanHistogram : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLRadixSortGlobalScanHistogram(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLRadixSortGlobalScanHistogram to be copied + */ + CLRadixSortGlobalScanHistogram(const CLRadixSortGlobalScanHistogram &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortGlobalScanHistogram to be copied + * @return Reference of this instance + */ + CLRadixSortGlobalScanHistogram &operator=(const CLRadixSortGlobalScanHistogram &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortGlobalScanHistogram to be moved + */ + CLRadixSortGlobalScanHistogram(CLRadixSortGlobalScanHistogram &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortGlobalScanHistogram to be moved + * @return Reference of this instance + */ + CLRadixSortGlobalScanHistogram &operator=(CLRadixSortGlobalScanHistogram &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] glob_sum_buf Buffer of global sum + * @param[out] temp_buf Temp buffer to be used while RadixSortGlobalScanHistogram + * @param[in] bits Number of bits to be used for radix sort + * return N/A + */ + void configure(cl::Buffer *glob_sum_buf, cl::Buffer *temp_buf, int bits); + + /* + * @brief Run CLRadixSortGlobalScanHistogram op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; +}; + +/** + * @brief Class to define CLRadixSortPasteHistogram + */ +class CLRadixSortPasteHistogram : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLRadixSortPasteHistogram(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLRadixSortPasteHistogram to be copied + */ + CLRadixSortPasteHistogram(const CLRadixSortPasteHistogram &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortPasteHistogram to be copied + * @return Reference of this instance + */ + CLRadixSortPasteHistogram &operator=(const CLRadixSortPasteHistogram &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortPasteHistogram to be moved + */ + CLRadixSortPasteHistogram(CLRadixSortPasteHistogram &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortPasteHistogram to be moved + * @return Reference of this instance + */ + CLRadixSortPasteHistogram &operator=(CLRadixSortPasteHistogram &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] hist_buf Buffer of histogram + * @param[out] glob_sum_buf Buffer of global sum + * @param[in] bits Number of bits to be used for radix sort + * return N/A + */ + void configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits); + + /* + * @brief Run CLRadixSortPasteHistogram op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; +}; + +/** + * @brief Class to define CLRadixSortReorder + */ +class CLRadixSortReorder : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLRadixSortReorder(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLRadixSortReorder to be copied + */ + CLRadixSortReorder(const CLRadixSortReorder &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortReorder to be copied + * @return Reference of this instance + */ + CLRadixSortReorder &operator=(const CLRadixSortReorder &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortReorder to be moved + */ + CLRadixSortReorder(CLRadixSortReorder &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortReorder to be moved + * @return Reference of this instance + */ + CLRadixSortReorder &operator=(CLRadixSortReorder &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] hist_buf Buffer of histogram + * @param[in] bits Number of bits to be used for radix sort + * @param[in] n Integer number size to sort + * return N/A + */ + void configure(cl::Buffer *hist_buf, int bits, int n); + + /** + * @brief Set pass + * @param[in] pass Passes made of in radix sort algorithm + * @param[in] in_key_buf Buffer of input key + * @param[out] out_key_buf Buffer of output key + * @param[in] in_ind_buf Buffer of input index + * @param[out] out_ind_buf Buffer of output index + * return N/A + */ + void setPass(int pass, cl::Buffer *in_key_buf, cl::Buffer *out_key_buf, cl::Buffer *in_ind_buf, + cl::Buffer *out_ind_buf) + { + _pass = pass; + _in_key_buf = in_key_buf; + _out_key_buf = out_key_buf; + _in_ind_buf = in_ind_buf; + _out_ind_buf = out_ind_buf; + } + /* + * @brief Run CLRadixSortReorder op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + int _pass; + cl::Buffer 
*_in_key_buf; + cl::Buffer *_out_key_buf; + cl::Buffer *_in_ind_buf; + cl::Buffer *_out_ind_buf; +}; + +/** + * @brief Class to define CLTopKV2FindFirstNegative + */ +class CLTopKV2FindFirstNegative : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLTopKV2FindFirstNegative(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2FindFirstNegative to be copied + */ + CLTopKV2FindFirstNegative(const CLTopKV2FindFirstNegative &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2FindFirstNegative to be copied + * @return Reference of this instance + */ + CLTopKV2FindFirstNegative &operator=(const CLTopKV2FindFirstNegative &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2FindFirstNegative to be moved + */ + CLTopKV2FindFirstNegative(CLTopKV2FindFirstNegative &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2FindFirstNegative to be moved + * @return Reference of this instance + */ + CLTopKV2FindFirstNegative &operator=(CLTopKV2FindFirstNegative &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] first_negative_idx_buf Buffer of the first negative index + * @param[in] n Number times to find + * return N/A + */ + void configure(cl::Buffer *first_negative_idx_buf, int n); + + /** + * @brief Set output buffer + * @param[out] out_key_buf Buffer of output key + * return N/A + */ + void setOutputBuffer(cl::Buffer *out_key_buf) { _out_key_buf = out_key_buf; } + + /* + * @brief Run CLTopKV2FindFirstNegative op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, 
cl::CommandQueue &queue) override; + +private: + cl::Buffer *_out_key_buf; +}; + +/** + * @brief Class to define CLTopKV2ReorderNegatives + */ +class CLTopKV2ReorderNegatives : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLTopKV2ReorderNegatives(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2ReorderNegatives to be copied + */ + CLTopKV2ReorderNegatives(const CLTopKV2ReorderNegatives &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2ReorderNegatives to be copied + * @return Reference of this instance + */ + CLTopKV2ReorderNegatives &operator=(const CLTopKV2ReorderNegatives &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2ReorderNegatives to be moved + */ + CLTopKV2ReorderNegatives(CLTopKV2ReorderNegatives &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2ReorderNegatives to be moved + * @return Reference of this instance + */ + CLTopKV2ReorderNegatives &operator=(CLTopKV2ReorderNegatives &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] first_negative_idx_buf Buffer of the first negative index + * @param[in] n Number times to find + * return N/A + */ + void configure(cl::Buffer *first_negative_idx_buf, int n); + + /** + * @brief Set buffers + * @param[in] in_key_buf Buffer of input key + * @param[out] out_key_buf Buffer of output key + * @param[in] in_ind_buf Buffer of input index + * @param[out] out_ind_buf Buffer of output index + * return N/A + */ + void setBuffers(cl::Buffer *in_key_buf, cl::Buffer *out_key_buf, cl::Buffer *in_ind_buf, + cl::Buffer *out_ind_buf) + { + _in_key_buf = in_key_buf; + _out_key_buf = 
out_key_buf; + _in_ind_buf = in_ind_buf; + _out_ind_buf = out_ind_buf; + } + + /* + * @brief Run CLTopKV2ReorderNegatives op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + cl::Buffer *_in_key_buf; + cl::Buffer *_out_key_buf; + cl::Buffer *_in_ind_buf; + cl::Buffer *_out_ind_buf; +}; + +/** + * @brief Class to define CLTopKV2Store + */ +class CLTopKV2Store : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLTopKV2Store(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2Store to be copied + */ + CLTopKV2Store(const CLTopKV2Store &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2Store to be copied + * @return Reference of this instance + */ + CLTopKV2Store &operator=(const CLTopKV2Store &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Store to be moved + */ + CLTopKV2Store(CLTopKV2Store &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Store to be moved + * @return Reference of this instance + */ + CLTopKV2Store &operator=(CLTopKV2Store &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] values Values tensor to store + * @param[out] indices Indices tensor to be used for store + * @param[in] k K of the top k predictions + * @param[in] n Number times to store + * return N/A + */ + void configure(ICLTensor *values, ICLTensor *indices, int k, int n); + + /** + * @brief Set buffers + * @param[out] out_key_buf Buffer of output key + * @param[out] out_ind_buf Buffer of output index + * return N/A 
+ */ + void setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf); + + /* + * @brief Run CLTopKV2Store op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + ICLTensor *_values; + ICLTensor *_indices; + cl::Buffer *_out_key_buf; + cl::Buffer *_out_ind_buf; +}; + +} // namespace arm_compute +#endif // Disable GPU implementation +#endif // __ARM_COMPUTE_CLTOPKV2KERNEL_H__ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h new file mode 100644 index 000000000..c5ef730b6 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLEKERNEL_H__ +#define __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLEKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the Upsampling layer kernel for transpose convolution on OpenCL. 
+ */ +class CLTransposeConvLayerUpsampleKernel : public ICLKernel +{ +public: + /** Constructor */ + CLTransposeConvLayerUpsampleKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLTransposeConvLayerUpsampleKernel(const CLTransposeConvLayerUpsampleKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLTransposeConvLayerUpsampleKernel & + operator=(const CLTransposeConvLayerUpsampleKernel &) = delete; + /** Default Move Constructor. */ + CLTransposeConvLayerUpsampleKernel(CLTransposeConvLayerUpsampleKernel &&) = default; + /** Default move assignment operator */ + CLTransposeConvLayerUpsampleKernel &operator=(CLTransposeConvLayerUpsampleKernel &&) = default; + /** Default destructor */ + ~CLTransposeConvLayerUpsampleKernel() = default; + + /** Initialise the kernel's input and output. + * + * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32. + * @param[out] output Destination tensor. Data types supported: same as @p input. All but + * the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only + * performed within the XY-plane. + * @param[in] inner_border Top and right inner border sizes. These rows and columns will be + * filled with zero. + * @param[in] info Contains padding and stride information described in @ref + * PadStrideInfo. + */ + void configure(const ICLTensor *input, ICLTensor *output, const BorderSize &inner_border, + const PadStrideInfo &info); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLTransposeConvLayerUpsample + * + * @param[in] input Source tensor info. Data types supported: QASYMM8/F16/F32. + * @param[in] output Destination tensor info. Data types supported: same as @p input. All + * but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is + * only performed within the XY-plane. 
+ * @param[in] inner_border Top and right inner border sizes. These rows and columns will be filled + * with zero. + * @param[in] info Contains padding and stride information described in @ref + * PadStrideInfo. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const BorderSize &inner_border, const PadStrideInfo &info); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + ICLTensor *_output; + BorderSize _inner_border; + PadStrideInfo _info; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLEKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h new file mode 100644 index 000000000..d093c22cb --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CPPUPSAMPLEKERNEL_EX_H__ +#define __ARM_COMPUTE_CPPUPSAMPLEKERNEL_EX_H__ + +#include "arm_compute/core/CPP/ICPPKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** CPP kernel to perform tensor upsample. + * + */ +class CPPUpsampleKernelEx : public ICPPKernel +{ +public: + const char *name() const override { return "CPPUpsampleKernelEx"; } + /** Default constructor */ + CPPUpsampleKernelEx(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CPPUpsampleKernelEx(const CPPUpsampleKernelEx &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CPPUpsampleKernelEx &operator=(const CPPUpsampleKernelEx &) = delete; + /** Allow instances of this class to be moved */ + CPPUpsampleKernelEx(CPPUpsampleKernelEx &&) = default; + /** Allow instances of this class to be moved */ + CPPUpsampleKernelEx &operator=(CPPUpsampleKernelEx &&) = default; + /** Default destructor */ + ~CPPUpsampleKernelEx() = default; + + /** Set the input and output of the kernel. + * + * @param[in] input The input tensor to upsample. Data types supported: F32/F16/QASYMM8 + * @param[out] output The output tensor. Data types supported: Same as @p input + * @param[in] info Padding info. 
+ */ + void configure(const ITensor *input, ITensor *output, const PadStrideInfo &info); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + bool is_parallelisable() const override; + +private: + const ITensor *_input; + ITensor *_output; + PadStrideInfo _info; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CPPUPSAMPLEKERNEL_EX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h b/compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h new file mode 100644 index 000000000..358e0ebc6 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __ARM_COMPUTE_NEELEMENTWISEOPERATIONFUNCS_H__ +#define __ARM_COMPUTE_NEELEMENTWISEOPERATIONFUNCS_H__ + +#include <arm_neon.h> + +namespace arm_compute +{ +class ITensor; +class Window; +class QuantizationInfo; +} // namespace arm_compute + +namespace arm_compute +{ + +float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, + const float32x4_t &scale); + +void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, + const float32x4_t &invscale); + +float32x4x4_t dup_quantized(uint8_t broadcast_value, int offset, float scale); + +void elementwise_op_quantized( + const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + uint8_t (*scalar_func)(const float &, const float &, QuantizationInfo), + int (*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, + float32x4_t, float32x4_t, float32x4_t, const bool), + int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *, int32x4_t, + int32x4_t, float32x4_t, float32x4_t, float32x4_t, float32x4_t)); + +void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + float (*scalar_func)(const float &, const float &), + int (*broadcast_func)(int, int, int, const float *, const float &, float *, + const bool), + int (*neon_func)(int, int, int, const float *, const float *, float *)); + +void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + uint8_t (*scalar_func)(const uint8_t &, const uint8_t &), + int (*broadcast_func)(int, int, int, const uint8_t *, const uint8_t &, + uint8_t *, const bool), + int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *)); +} // namespace arm_compute +#endif // __ARM_COMPUTE_NEELEMENTWISEOPERATIONFUNCS_H__ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h 
b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h new file mode 100644 index 000000000..61992bd50 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__ +#define __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__ + +#include "arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ + +class NEBinaryLogicalOperationKernel : public NEElementwiseOperationKernel +{ +public: + /** Default destructor */ + ~NEBinaryLogicalOperationKernel() = default; + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEBinaryLogicalOperationKernel + * + * @param[in] op Binary logical operation to be executed. + * @param[in] input1 First tensor input. Data types supported: QASYMM8/U8. + * @param[in] input2 Second tensor input. Data types supported: Same as @p input1. + * @param[in] output Output tensor. Data types supported: Same as @p input1. + */ + void configure(BinaryLogicalOperation op, const ITensor *input1, const ITensor *input2, + ITensor *output); + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEBinaryLogicalOperationKernel + * + * @param[in] op Binary logical operation to be executed. + * @param[in] input1 First tensor input info. Data types supported: QASYMM8/U8. + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[in] output Output tensor info. Data types supported: Same as @p input1. 
+ * + * @return a Status + */ + static Status validate(BinaryLogicalOperation op, const ITensorInfo *input1, + const ITensorInfo *input2, const ITensorInfo *output); + +protected: + // Inherited methods overridden: + static Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, + const ITensorInfo &output); +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h new file mode 100644 index 000000000..fd2a2ee3b --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_NECASTKERNEL_H__ +#define __ARM_COMPUTE_NECASTKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the cast layer kernel. */ +class NECastKernel : public INEKernel +{ +public: + const char *name() const override { return "NECastKernel"; } + /** Default constructor */ + NECastKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NECastKernel(const NECastKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NECastKernel &operator=(const NECastKernel &) = delete; + /** Default Move Constructor. */ + NECastKernel(NECastKernel &&) = default; + /** Default move assignment operator */ + NECastKernel &operator=(NECastKernel &&) = default; + /** Default destructor */ + ~NECastKernel() = default; + /** Set input, output tensors. + * + * @param[in] input Source tensor. Data type supported: U8/S8/QASYMM8/U32/S32/F32. + * @param[out] output Destination tensor with the same dimensions of input. Data type supported: + * U8/S8/QASYMM8/U32/S32/F32. + * @param[in] input_subtype Sub data type of input. + */ + void configure(const ITensor *input, ITensor *output, SubDataType input_subtype); + /** Static function to check if given info will lead to a valid configuration of @ref NECastKernel + * + * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U32/S32/F32. + * @param[in] output Output tensor info. Data types supported: U8/S8/QASYMM8/U32/S32/F32. + * @param[in] input_subtype Sub data type of input. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + SubDataType input_subtype); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + const ITensor *_input; + ITensor *_output; + SubDataType _input_subtype; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NECASTKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h new file mode 100644 index 000000000..5b6ef6bfb --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNELEX_H__ +#define __ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNELEX_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the depth to space kernel */ +class NEDepthToSpaceLayerKernelEx : public INEKernel +{ +public: + const char *name() const override { return "NEDepthToSpaceLayerKernelEx"; } + /** Default constructor */ + NEDepthToSpaceLayerKernelEx(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEDepthToSpaceLayerKernelEx(const NEDepthToSpaceLayerKernelEx &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEDepthToSpaceLayerKernelEx &operator=(const NEDepthToSpaceLayerKernelEx &) = delete; + /** Allow instances of this class to be moved */ + NEDepthToSpaceLayerKernelEx(NEDepthToSpaceLayerKernelEx &&) = default; + /** Allow instances of this class to be moved */ + NEDepthToSpaceLayerKernelEx &operator=(NEDepthToSpaceLayerKernelEx &&) = default; + /** Default destructor */ + ~NEDepthToSpaceLayerKernelEx() = default; + /** Initialise the kernel's inputs and output. + * + * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[out] output Tensor output. Data types supported: same as @p input + * @param[in] block_shape Block shape x value. + */ + void configure(const ITensor *input, ITensor *output, int32_t block_shape); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEDepthToSpaceLayerKernelEx. + * + * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] output Tensor output info. Data types supported: same as @p input + * @param[in] block_shape Block shape value. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + const ITensor *_input; /**< Source tensor */ + ITensor *_output; /**< Destination tensor */ + int32_t _block_shape; /**< Block shape */ +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNELEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h new file mode 100644 index 000000000..d6fad1155 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEELEMENTWISEUNARYKERNELEX_H__ +#define __ARM_COMPUTE_NEELEMENTWISEUNARYKERNELEX_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for an element-wise unary operation kernel + * + * Element-wise operation is computed by: + * @f[ output(x) = OP(input(x))@f] + * + */ +class NEElementwiseUnaryKernelEx : public INEKernel +{ +public: + const char *name() const override { return "NEElementwiseUnaryKernelEx"; } + /** Default constructor */ + NEElementwiseUnaryKernelEx(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEElementwiseUnaryKernelEx(const NEElementwiseUnaryKernelEx &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEElementwiseUnaryKernelEx &operator=(const NEElementwiseUnaryKernelEx &) = delete; + /** Allow instances of this class to be moved */ + NEElementwiseUnaryKernelEx(NEElementwiseUnaryKernelEx &&) = default; + /** Allow instances of this class to be moved */ + NEElementwiseUnaryKernelEx &operator=(NEElementwiseUnaryKernelEx &&) = default; + /** Default destructor */ + ~NEElementwiseUnaryKernelEx() = default; + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEElementwiseUnaryKernelEx + * + * @param[in] op Arithmetic operation to be executed. + * @param[in] input First tensor input. Data types supported: F16/F32/S32. + * @param[in] output Output tensor. Data types supported: Same as @p input. 
+ */ + void configure(ElementWiseUnaryEx op, const ITensor *input, ITensor *output); + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEElementwiseUnaryKernelEx + * + * @param[in] op Arithmetic operation to be executed. + * @param[in] input First tensor input info. Data types supported: F16/F32/S32. + * @param[in] output Output tensor info. Data types supported: Same as @p input. + * + * @return a Status + */ + static Status validate(ElementWiseUnaryEx op, const ITensorInfo *input, + const ITensorInfo *output); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + + /** Common signature for all the specialised arithmetic functions + * + * @param[in] input An input tensor. Data types supported: F16/F32/S32. + * @param[out] output The output tensor. Data types supported: Same as @p input. + * @param[in] window Region on which to execute the kernel. + */ + using ElementwiseUnaryFunction = void(const ITensor *input, ITensor *output, + const Window &window); + +protected: + // Inherited methods overridden: + static Status validate_arguments(const ITensorInfo &input, const ITensorInfo &output); + + /** Function to use for the particular tensor types passed to configure() */ + std::function<void(const ITensor *input, ITensor *output, const Window &window)> _function; + + const ITensor *_input; + ITensor *_output; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEELEMENTWISEUNARYKERNELEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h new file mode 100644 index 000000000..1490e75f2 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEEMBEDDINGLOOKUPKERNEL_H__ +#define __ARM_COMPUTE_NEEMBEDDINGLOOKUPKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** NEON kernel to perform EmbeddingLookup operation */ +class NEEmbeddingLookupKernel : public INEKernel +{ +public: + const char *name() const override { return "NEEmbeddingLookupKernel"; } + /** Default constructor */ + NEEmbeddingLookupKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + NEEmbeddingLookupKernel(const NEEmbeddingLookupKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). 
*/ + NEEmbeddingLookupKernel &operator=(const NEEmbeddingLookupKernel &) = delete; + /** Allow instances of this class to be moved */ + NEEmbeddingLookupKernel(NEEmbeddingLookupKernel &&) = default; + /** Allow instances of this class to be moved */ + NEEmbeddingLookupKernel &operator=(NEEmbeddingLookupKernel &&) = default; + /** Initialize the kernel's input, output. + * + * @param[in] input Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[out] output Destination tensor. Data types supported: same as @p input. + * @param[in] lookups Lookups are 1D tensor that values are indices into the first dimension of + * input. + */ + void configure(const ITensor *input, ITensor *output, const ITensor *lookups); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEEmbeddingLookupKernel + * + * @param[in] input Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] output Destination tensor. Data types supported: same as @p input. + * @param[in] lookups Lookups info. Data types supported: S32. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *lookups); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + const ITensor *_input; + const ITensor *_lookups; + ITensor *_output; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NEEMBEDDINGLOOKUPKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h new file mode 100644 index 000000000..3fa9c6e9a --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_NEGATHERKERNELEX_H__ +#define __ARM_COMPUTE_NEGATHERKERNELEX_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** Kernel to perform other operation on NEON */ +class NEGatherKernelEx : public INEKernel +{ +public: + /** Default constructor. */ + NEGatherKernelEx(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + NEGatherKernelEx(const NEGatherKernelEx &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + NEGatherKernelEx &operator=(const NEGatherKernelEx &) = delete; + /** Allow instances of this class to be moved. */ + NEGatherKernelEx(NEGatherKernelEx &&) = default; + /** Allow instances of this class to be moved. 
*/ + NEGatherKernelEx &operator=(NEGatherKernelEx &&) = default; + /** Default detructor */ + ~NEGatherKernelEx() = default; + + /** Name of the kernel + * + * @return Kernel name + */ + const char *name() const override { return "NEGatherKernelEx"; } + /** Initialise the kernel's inputs and outputs + * + * @param[in] input Source tensor. Supported tensor rank: up to 4. Data type supported: + * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following type: U32/S32. Each value Must be in range [0, input.shape[@p axis]) + * @param[out] output Destination tensor. Data type supported: Same as @p input + * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative values + * wrap around. Defaults to 0 + */ + void configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis = 0); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEGatherKernelEx + * + * @param[in] input Source tensor info. Supported tensor rank: up to 4. Data type supported: + * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. Must be one of the + * following type: U32/S32. Each value Must be in range [0, input.shape[@p axis]) + * @param[in] output Destination tensor info. Data type supported: Same as @p input + * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative values + * wrap around. Defaults to 0 + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + /** Implementation of the gather operation for 0 axis. + * + * For gather on the 0 axis an element by element copy is performed. 
+ * + * @param[in] window Region on which to execute the kernel. (Must be a region of the window + * returned by window()) + * @param[in] info Info about executing thread and CPU. + */ + template <typename U> void gather_0_axis(const Window &window, const ThreadInfo &info); + + /** Implementation of the gather operation. + * + * For 1<=axis a row-wise copy is taking place. + * + * @param[in] window Region on which to execute the kernel. (Must be a region of the window + * returned by window()) + * @param[in] info Info about executing thread and CPU. + */ + template <typename U> void gather_n_axis(const Window &window, const ThreadInfo &info); + + using kernel_ptr = void (NEGatherKernelEx::*)(const Window &window, const ThreadInfo &info); + + const ITensor *_input; + const ITensor *_indices; + int _axis; + ITensor *_output; + kernel_ptr _func; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEGATHERKERNELEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h new file mode 100644 index 000000000..d8976e7d0 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEHASHTABLELOOKUPKERNEL_H__ +#define __ARM_COMPUTE_NEHASHTABLELOOKUPKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** NEON kernel to perform HashtableLookup operation */ +class NEHashtableLookupKernel : public INEKernel +{ +public: + const char *name() const override { return "NEHashtableLookupKernel"; } + /** Default constructor */ + NEHashtableLookupKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + NEHashtableLookupKernel(const NEHashtableLookupKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). 
*/ + NEHashtableLookupKernel &operator=(const NEHashtableLookupKernel &) = delete; + /** Allow instances of this class to be moved */ + NEHashtableLookupKernel(NEHashtableLookupKernel &&) = default; + /** Allow instances of this class to be moved */ + NEHashtableLookupKernel &operator=(NEHashtableLookupKernel &&) = default; + /** Initialize the kernel's inputs, outputs. + * + * @param[in] lookups Lookups 1D tensor that values are indices into the first dimension of + * input. Data types supported: S32 + * @param[in] keys Keys 1D tensor. keys and input pair represent a map. + * Data types supported: S32 + * @param[in] input Source tensor. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p + * input. + * @param[out] hits Hits 1D tensor. A boolean tensor that indicates whether the lookup hits + * (True) or not (False). Data types supported: U8/QASYMM8 + * input. + */ + void configure(const ITensor *lookups, const ITensor *keys, const ITensor *input, ITensor *output, + ITensor *hits); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEHashtableLookupKernel + * + * @param[in] lookups The lookups tensor info. Data types supported: S32. + * @param[in] keys The keys tensor info. keys and input pair represent a map. + * Data types supported: S32 + * @param[in] input The input tensor info. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output The output tensor info. Data types and data layouts supported: Same as @p + * input. + * @param[out] hits The hits tensor info. A boolean tensor that indicates whether the lookup + * hits (True) or not (False). 
Data types supported: U8/QASYMM8 + * + * @return a status + */ + static Status validate(const ITensorInfo *lookups, const ITensorInfo *keys, + const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *hits); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + const ITensor *_lookups; /** Lookups tensor */ + const ITensor *_keys; /** Keys tensor */ + const ITensor *_input; /** Source tensor */ + ITensor *_output; /** Destination tensor */ + ITensor *_hits; /** Hits tensor */ +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NEHASHTABLELOOKUPKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h new file mode 100644 index 000000000..76e2587af --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNELEX_H__ +#define __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNELEX_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for performing an instance normalization */ +class NEInstanceNormalizationLayerKernelEx : public INEKernel +{ +public: + const char *name() const override { return "NEInstanceNormalizationLayerKernelEx"; } + /** Default constructor */ + NEInstanceNormalizationLayerKernelEx(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEInstanceNormalizationLayerKernelEx(const NEInstanceNormalizationLayerKernelEx &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEInstanceNormalizationLayerKernelEx & + operator=(const NEInstanceNormalizationLayerKernelEx &) = delete; + /** Allow instances of this class to be moved */ + NEInstanceNormalizationLayerKernelEx(NEInstanceNormalizationLayerKernelEx &&) = default; + /** Allow instances of this class to be moved */ + NEInstanceNormalizationLayerKernelEx & + operator=(NEInstanceNormalizationLayerKernelEx &&) = default; + /** Default destructor */ + ~NEInstanceNormalizationLayerKernelEx() = default; + /** Set the input and output tensors. + * + * @param[in, out] input Source tensor. Data types supported: F16/F32. Data layout supported: + * NCHW + * In case of @p output tensor = nullptr this tensor will store the result + * of the normalization. + * @param[out] output Destination tensor. Data types and data layouts supported: same as @p + * input. 
+ * @param[in] gamma (Optional) The scale scalar value applied to the normalized tensor. + * Defaults to 1.0 + * @param[in] beta (Optional) The offset scalar value applied to the normalized tensor. + * Defaults to 0.0 + * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12 + */ + void configure(ITensor *input, ITensor *output, ITensor *gamma = nullptr, ITensor *beta = nullptr, + float epsilon = 1e-12f); + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEInstanceNormalizationLayer. + * + * @param[in] input Source tensor info. Data types supported: F16/F32. Data layout supported: + * NCHW + * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p + * input. + * @param[in] gamma (Optional) The scale scalar value applied to the normalized tensor. Defaults + * to 1.0 + * @param[in] beta (Optional) The offset scalar value applied to the normalized tensor. + * Defaults to 0.0 + * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12 + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma = nullptr, const ITensorInfo *beta = nullptr, + float epsilon = 1e-12f); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + /** Common signature for all the specialized instance normalization functions + * + * @param[in, out] input An input tensor. In case of @p output tensor = nullptr this tensor will + * store the result of the normalization. + * @param[out] output The output tensor. + * @param[in] gamma The scale scalar value applied to the normalized tensor. Defaults to + * 1.0 + * @param[in] beta The offset scalar value applied to the normalized tensor. Defaults to + * 0.0 + * @param[in] epsilon Lower bound value for the normalization. 
Defaults to 1e-12 + */ + using NormalizationFunction = void(ITensor *input, ITensor *output, ITensor *gamma, ITensor *beta, + float epsilon, const Window &window); + + NormalizationFunction *_func; + ITensor *_input; + ITensor *_output; + ITensor *_gamma; + ITensor *_beta; + float _epsilon; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNELEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h new file mode 100644 index 000000000..723b14523 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_NEMULTIPLYSCALEFACTORKERNEL_H__ +#define __ARM_COMPUTE_NEMULTIPLYSCALEFACTORKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface to multiply scale factor kernel. */ +class NEMultiplyScaleFactorKernel : public INEKernel +{ +public: + const char *name() const override { return "NEMultiplyScaleFactorKernel"; } + /** Default constructor */ + NEMultiplyScaleFactorKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEMultiplyScaleFactorKernel(const NEMultiplyScaleFactorKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEMultiplyScaleFactorKernel &operator=(const NEMultiplyScaleFactorKernel &) = delete; + /** Default Move Constructor. */ + NEMultiplyScaleFactorKernel(NEMultiplyScaleFactorKernel &&) = default; + /** Default move assignment operator */ + NEMultiplyScaleFactorKernel &operator=(NEMultiplyScaleFactorKernel &&) = default; + /** Default destructor */ + ~NEMultiplyScaleFactorKernel() = default; + /** Set input, output tensors. + * + * @param[in/out] input Source tensor. Data type supported: S32. + * @param[in] scale_factor Scale tensor. Data type supported: F16/F32. + * @param[out] output Destination tensor. Data type supported: Same as @p scale_factor. + */ + void configure(const ITensor *input, const ITensor *scale_factor, ITensor *output, + float multiplier = 1.f); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEMultiplyScaleFactorKernel + * + * @param[in] input Input tensor info. Data types supported: S32. + * @param[in] scale_factor Scale tensor. Data type supported: F16/F32. + * @param[in] output Output tensor info. Data types supported: Same as @p scale_factor. 
+ * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *scale_factor, + const ITensorInfo *output, float multiplier = 1.f); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + template <typename T> void multiply(const Window &window); + +private: + const ITensor *_input; + const ITensor *_scale_factor; + ITensor *_output; + float _multiplier; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NEMULTIPLYSCALEFACTORKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h new file mode 100644 index 000000000..79bb78661 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEPRELUKERNEL_H__ +#define __ARM_COMPUTE_NEPRELUKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the kernel to perform Parametric Rectified Linear Unit + * + * Result is computed by: + * @f[ output(x) = alpha * x for x < 0, output(x) = x for x >= 0 @f] + */ +class NEPReLUKernel : public INEKernel +{ +public: + const char *name() const override { return "NEPReLUKernel"; } + /** Default constructor */ + NEPReLUKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEPReLUKernel(const NEPReLUKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEPReLUKernel &operator=(const NEPReLUKernel &) = delete; + /** Allow instances of this class to be moved */ + NEPReLUKernel(NEPReLUKernel &&) = default; + /** Allow instances of this class to be moved */ + NEPReLUKernel &operator=(NEPReLUKernel &&) = default; + /** Initialise the kernel's inputs and output + * + * @param[in] input Input tensor. Data type supported: QASYMM8/F32 + * @param[in] alpha Alpha tensor. Data types supported: Same as @p input + * @param[out] output Output tensor. Data types supported: Same as @p input + */ + void configure(const ITensor *input, const ITensor *alpha, ITensor *output); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEPReLUKernel.h + * + * @param[in] input Input tensor input info. Data types supported: QASYMM8/F32. + * @param[in] alpha Alpha tensor input info. 
Data types supported: Same as @p input. + * @param[in] output Output tensor info. Data types supported: Same as @p input. + * + * @return a Status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *alpha, + const ITensorInfo *output); + static Status validate_arguments(const ITensorInfo &input, const ITensorInfo &alpha, + const ITensorInfo &output); + +private: + const ITensor *_input; /**< Source tensor */ + const ITensor *_alpha; /**< Alpha tensor */ + ITensor *_output; /**< Destination tensor */ +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEPRELUKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h new file mode 100644 index 000000000..590b23873 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEQUANTIZATIONSYMMETRICKERNEL_H__ +#define __ARM_COMPUTE_NEQUANTIZATIONSYMMETRICKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the dequantization layer kernel. */ +class NEQuantizationSymmetricKernel : public INEKernel +{ +public: + const char *name() const override { return "NEQuantizationSymmetricKernel"; } + /** Default constructor */ + NEQuantizationSymmetricKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEQuantizationSymmetricKernel(const NEQuantizationSymmetricKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEQuantizationSymmetricKernel &operator=(const NEQuantizationSymmetricKernel &) = delete; + /** Default Move Constructor. */ + NEQuantizationSymmetricKernel(NEQuantizationSymmetricKernel &&) = default; + /** Default move assignment operator */ + NEQuantizationSymmetricKernel &operator=(NEQuantizationSymmetricKernel &&) = default; + /** Default destructor */ + ~NEQuantizationSymmetricKernel() = default; + /** Set input, output tensors. + * + * @param[in] input Source tensor. Data type supported: F16/F32. + * @param[out] output Destination tensor with the same dimensions of input. Data type supported: + * S8. + * @param[out] scale_factor Scale tensor of @p output. Data type supported: Same as @p input. + */ + void configure(const ITensor *input, ITensor *output, ITensor *scale_factor); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEQuantizationSymmetricKernel + * + * @param[in] input Input tensor info. 
Data types supported: F16/F32. + * @param[in] output Output tensor info. Data types supported: S8. + * @param[out] scale_factor Scale tensor of @p output. Data type supported: Same as @p input. + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *scale_factor); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + template <typename T> void quantize(const Window &window); + +private: + const ITensor *_input; + ITensor *_output; + ITensor *_scale_factor; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NEQUANTIZATIONSYMMETRICKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h new file mode 100644 index 000000000..73991b67d --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEREDUCTIONOPERATIONKERNELEX_H__ +#define __ARM_COMPUTE_NEREDUCTIONOPERATIONKERNELEX_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ITensor; + +/** NEON kernel to perform a reduction operation */ +class NEReductionOperationKernelEx : public INEKernel +{ +public: + const char *name() const override { return "NEReductionOperationKernelEx"; } + /** Default constructor */ + NEReductionOperationKernelEx(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEReductionOperationKernelEx(const NEReductionOperationKernelEx &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEReductionOperationKernelEx &operator=(const NEReductionOperationKernelEx &) = delete; + /** Allow instances of this class to be moved */ + NEReductionOperationKernelEx(NEReductionOperationKernelEx &&) = default; + /** Allow instances of this class to be moved */ + NEReductionOperationKernelEx &operator=(NEReductionOperationKernelEx &&) = default; + /** Default destructor */ + ~NEReductionOperationKernelEx() = default; + + /** Set the source, destination of the kernel + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. Data layouts supported: + * NCHW. + * @param[out] output Destination tensor.Data types and data layouts supported: same as @p input. 
+ * Output will have the same number of dimensions as input. + * @param[in] axis Axis along which to reduce. Supported reduction axis : 0 + * @param[in] op Reduction operation to perform. + */ + void configure(const ITensor *input, ITensor *output, unsigned int axis, ReduceOperation op); + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEReductionOperationKernelEx. + * + * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. Data layouts + * supported: NCHW. + * @param[in] output Destination tensor info.Data types and data layouts supported: same as @p + * input. + * Output will have the same number of dimensions as input. + * @param[in] axis Axis along which to reduce. Supported reduction axis : 0 + * @param[in] op Reduction operation to perform. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, + ReduceOperation op); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + BorderSize border_size() const override; + +private: + const ITensor *_input; + ITensor *_output; + unsigned int _reduction_axis; + ReduceOperation _op; + BorderSize _border_size; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NEREDUCTIONOPERATIONKERNELEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h new file mode 100644 index 000000000..5d697c2b2 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_NESPACETODEPTHLAYERKERNELEX_H__ +#define __ARM_COMPUTE_NESPACETODEPTHLAYERKERNELEX_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the space to depth kernel */ +class NESpaceToDepthLayerKernelEx : public INEKernel +{ +public: + const char *name() const override { return "NESpaceToDepthLayerKernelEx"; } + /** Default constructor */ + NESpaceToDepthLayerKernelEx(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NESpaceToDepthLayerKernelEx(const NESpaceToDepthLayerKernelEx &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NESpaceToDepthLayerKernelEx &operator=(const NESpaceToDepthLayerKernelEx &) = delete; + /** Allow instances of this class to be moved */ + NESpaceToDepthLayerKernelEx(NESpaceToDepthLayerKernelEx &&) = default; + /** Allow instances of this class to be moved */ + NESpaceToDepthLayerKernelEx &operator=(NESpaceToDepthLayerKernelEx &&) = default; + /** Default destructor */ + ~NESpaceToDepthLayerKernelEx() = default; + /** Initialise the kernel's inputs and output. + * + * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[out] output Tensor output. Data types supported: same as @p input + * @param[in] block_shape Block shape value + */ + void configure(const ITensor *input, ITensor *output, int32_t block_shape); + /** Static function to check if given info will lead to a valid configuration of @ref + * NESpaceToDepthLayerKernelEx + * + * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] output Tensor output info. 
Data types supported: same as @p input + * @param[in] block_shape Block shape value + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + const ITensor *_input; /**< Source tensor */ + ITensor *_output; /**< Destination tensor */ + int32_t _block_shape; /**< Block shape */ +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NESPACETODEPTHLAYERKERNELEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/TypesEx.h b/compute/ARMComputeEx/arm_compute/core/TypesEx.h new file mode 100644 index 000000000..3b0902f08 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/TypesEx.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __ARM_COMPUTE_TYPESEX_H__ +#define __ARM_COMPUTE_TYPESEX_H__ + +namespace arm_compute +{ + +/** Available ArgIndex operations **/ +enum class ArgOperation +{ + MAX, + MIN, +}; + +/** Available reduce operations */ +enum class ReduceOperation +{ + MAX, /**< Max */ + MEAN, /**< Mean */ + SUM, /**< Sum */ + MIN, /**< Min */ +}; + +/** Available binary logical operations */ +enum class BinaryLogicalOperation +{ + AND, /**< AND */ + OR, /**< OR */ +}; + +enum class ComparisonOperationEx +{ + EQUAL, /**< EQUAL */ + NOT_EQUAL, /**< NOT_EQUAL */ +}; + +enum class ElementWiseUnaryEx +{ + NEG, /**< NEG */ +}; + +enum class SubDataType +{ + NONE, + BOOL, +}; + +} // namespace arm_compute +#endif /* __ARM_COMPUTE_TYPESEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/UtilsEx.h b/compute/ARMComputeEx/arm_compute/core/UtilsEx.h new file mode 100644 index 000000000..39026e6bb --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/UtilsEx.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_UTILSEX_H__ +#define __ARM_COMPUTE_UTILSEX_H__ + +#include <utility> + +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ + +/** Returns expected width and height of the transpose convolution's output tensor. 
+ * + * @note This function was copied in order to fix a bug computing to wrong output dimensions. + * + * @param[in] in_width Width of input tensor (Number of columns) + * @param[in] in_height Height of input tensor (Number of rows) + * @param[in] kernel_width Kernel width. + * @param[in] kernel_height Kernel height. + * @param[in] info padding and stride info. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_top The number of zeros added to bottom edge of the output. + * + * @return A pair with the new width in the first position and the new height in the second. + */ +const std::pair<unsigned int, unsigned int> +transposeconv_output_dimensions(unsigned int in_width, unsigned int in_height, + unsigned int kernel_width, unsigned int kernel_height, + const PadStrideInfo &info, unsigned int invalid_right, + unsigned int invalid_top); +} +#endif /*__ARM_COMPUTE_UTILSEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h b/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h new file mode 100644 index 000000000..16fd40ed9 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __ARM_COMPUTE_MISC_SHAPE_CALCULATOR_EX_H__ +#define __ARM_COMPUTE_MISC_SHAPE_CALCULATOR_EX_H__ + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/core/Utils.h" + +#include "arm_compute/core/utils/helpers/tensor_transform.h" + +#include <cmath> + +namespace arm_compute +{ +namespace misc +{ +namespace shape_calculator +{ + +/** Calculate the upsampled output shape used for transpose convolution + * + * @param[in] input Input tensor info + * @param[in] weights Weights tensor shape + * @param[in] info Padding and stride info + * @param[in] out_dims Output shape dimensions + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[out] pad_left Padding on left + * @param[out] pad_right Padding on right + * @param[out] pad_top Padding on top + * @param[out] pad_bottom Padding on bottom + * + * @return the calculated shape + */ +inline TensorShape compute_transposeconv_upsampled_shape( + const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &info, + std::pair<unsigned int, unsigned int> &out_dims, unsigned int invalid_right, + unsigned int invalid_bottom, unsigned int &pad_left, unsigned int &pad_right, + unsigned int &pad_top, unsigned int &pad_bottom) +{ + unsigned int sx = info.stride().first; + unsigned int sy = info.stride().second; + const DataLayout data_layout = input.data_layout(); + const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + // Find the upsampled dimensions + // transpose conv out: + // tconv_out + pad = 1 + (in - 1) * stride + invalid + // tconv_out = 1 + (in - 1) * stride + invalid - pad + // upsample out: + // upsample_out = 1 + (in - 1) * stride + unsigned int out_x = (input.dimension(idx_w) - 1) 
* sx + 1; + unsigned int out_y = (input.dimension(idx_h) - 1) * sy + 1; + + // Find the padding needed for the convolution with stride 1 in order to match output shape + // upsample+pad out: + // upsample_out + pad = tconv_out + kernel - 1 + // pad = tconv_out + kernel - 1 - upsample_out + unsigned int padx = out_dims.first - (out_x - weights.dimension(idx_w) + 1); + unsigned int pady = out_dims.second - (out_y - weights.dimension(idx_h) + 1); + out_x += padx; + out_y += pady; + + unsigned int padx_all_except_invallid = padx + info.pad_left() + info.pad_right() - invalid_right; + unsigned int pady_all_except_invallid = + pady + info.pad_top() + info.pad_bottom() - invalid_bottom; + pad_left = (padx_all_except_invallid + 1) / 2 - info.pad_left(); + pad_right = pady_all_except_invallid / 2 - info.pad_right() + invalid_right; + pad_top = (padx_all_except_invallid + 1) / 2 - info.pad_top(); + pad_bottom = pady_all_except_invallid / 2 - info.pad_bottom() + invalid_bottom; + + TensorShape scale_out_shape(input.tensor_shape()); + scale_out_shape.set(idx_w, out_x); + scale_out_shape.set(idx_h, out_y); + + return scale_out_shape; +} + +/** Calculate the output shape of the transpose convolution layer + * + * @param[in] out_dims Output x and y shape dimensions + * @param[in] input Input tensor info + * @param[in] weights Weights tensor shape + * + * @return the calculated shape + */ +inline TensorShape +compute_transposeconv_output_shape(const std::pair<unsigned int, unsigned int> &out_dims, + const ITensorInfo &input, const ITensorInfo &weights) +{ + const TensorShape input_shape{input.tensor_shape()}; + const TensorShape weights_shape{weights.tensor_shape()}; + + const DataLayout data_layout = input.data_layout(); + const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const int channel_idx = + 
get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + const int batch_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); + + TensorShape out_shape{input_shape}; + out_shape.set(width_idx, out_dims.first); + out_shape.set(height_idx, out_dims.second); + out_shape.set(channel_idx, weights_shape[batch_idx]); + return out_shape; +} + +/** Calculate the depth to space output shape of a tensor + * + * @param[in] input Input tensor info + * @param[in] block Block shape value + * + * @return the calculated shape + */ +inline TensorShape compute_depth_to_space_shape_ex(const ITensorInfo *input, int block) +{ + ARM_COMPUTE_ERROR_ON(block < 2); + + const DataLayout data_layout = input->data_layout(); + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const int idx_channel = + get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + + TensorShape output_shape{input->tensor_shape()}; + output_shape.set(idx_width, input->dimension(idx_width) * block); + output_shape.set(idx_height, input->dimension(idx_height) * block); + output_shape.set(idx_channel, input->dimension(idx_channel) / (block * block)); + + return output_shape; +} + +/** Calculate the space to batch output shape of a tensor + * + * @param[in] input Input tensor info + * @param[in] block_shape Block shape value + * + * @return the calculated shape + */ +inline TensorShape compute_space_to_depth_shape_ex(const ITensorInfo *input, int32_t block_shape) +{ + ARM_COMPUTE_ERROR_ON(block_shape < 2); + TensorShape output_shape{input->tensor_shape()}; + + const DataLayout data_layout = input->data_layout(); + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + 
const int idx_depth = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + + output_shape.set(idx_width, input->tensor_shape()[idx_width] * block_shape); + output_shape.set(idx_height, input->tensor_shape()[idx_height] * block_shape); + output_shape.set(idx_depth, input->tensor_shape()[idx_depth] / (block_shape * block_shape)); + + return output_shape; +} + +/** Calculate the gather output shape of a tensor + * + * @param[in] input_shape Input tensor shape + * @param[in] indices_shape Indices tensor shape + * @param[in] actual_axis The axis to be gathered + * + * @return the calculated shape + */ +inline TensorShape compute_gather_shape_ex(const TensorShape &input_shape, + const TensorShape &indices_shape, uint32_t actual_axis) +{ + ARM_COMPUTE_ERROR_ON(indices_shape.num_dimensions() > 3); + ARM_COMPUTE_ERROR_ON(input_shape.num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(input_shape.num_dimensions() + indices_shape.num_dimensions() - 1 > 4); + ARM_COMPUTE_ERROR_ON(actual_axis >= input_shape.num_dimensions()); + + TensorShape output_shape = input_shape; + if (indices_shape.num_dimensions() == 1) + { + output_shape[actual_axis] = indices_shape[0]; + } + else if (indices_shape.num_dimensions() > 1) + { + output_shape.shift_right(indices_shape.num_dimensions() - 1); + + for (uint32_t i = 0, o = 0; o < output_shape.num_dimensions(); ++o, ++i) + { + if (o == actual_axis) + { + ++i; + for (uint32_t in = 0; in < indices_shape.num_dimensions(); ++in, ++o) + { + output_shape[o] = indices_shape[in]; + } + } + else + { + output_shape[o] = input_shape[i]; + } + } + } + return output_shape; +} + +} // namespace shape_calculator +} // namespace misc +} // namespace arm_compute + +#endif // __ARM_COMPUTE_MISC_SHAPE_CALCULATOR_EX_H__ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h new file mode 100644 index 000000000..831bb5423 --- /dev/null +++ 
b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLFUNCTIONSEX_H__ +#define __ARM_COMPUTE_CLFUNCTIONSEX_H__ + +#include <arm_compute/runtime/CL/functions/CLArgOperation.h> +#include <arm_compute/runtime/CL/functions/CLBatchToSpaceND.h> +#include <arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h> +#include <arm_compute/runtime/CL/functions/CLCast.h> +#include <arm_compute/runtime/CL/functions/CLDepthToSpace.h> +#include <arm_compute/runtime/CL/functions/CLEmbeddingLookup.h> +#include <arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h> +#include <arm_compute/runtime/CL/functions/CLGatherEx.h> +#include <arm_compute/runtime/CL/functions/CLHashtableLookup.h> +#include <arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h> +#include <arm_compute/runtime/CL/functions/CLLogicalNot.h> +#include <arm_compute/runtime/CL/functions/CLNeg.h> +#include <arm_compute/runtime/CL/functions/CLPixelWiseDivision.h> +#include <arm_compute/runtime/CL/functions/CLPReLU.h> +#include <arm_compute/runtime/CL/functions/CLReduceOperation.h> +#include <arm_compute/runtime/CL/functions/CLRNNLayerEx.h> +#include <arm_compute/runtime/CL/functions/CLSpaceToBatchND.h> +#include <arm_compute/runtime/CL/functions/CLSpaceToDepth.h> +#include 
<arm_compute/runtime/CL/functions/CLSplit.h> +#include <arm_compute/runtime/CL/functions/CLStridedSliceEx.h> +#include <arm_compute/runtime/CL/functions/CLTopKV2.h> +#include <arm_compute/runtime/CL/functions/CLTransposeConvLayer.h> + +#endif // __ARM_COMPUTE_CLFUNCTIONSEX_H__ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h new file mode 100644 index 000000000..d9d0d4d35 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file CLArgOperation.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLArgOperation class + */ + +#ifndef __ARM_COMPUTE_CLARGOPERATION_H__ +#define __ARM_COMPUTE_CLARGOPERATION_H__ + +#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to execute CLArgOperation operation + */ +class CLArgOperation : public IFunction +{ +public: + /** + * @brief Construct a new CLArgOperation object + */ + CLArgOperation(); + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + */ + CLArgOperation(const CLArgOperation &) = delete; + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + */ + CLArgOperation &operator=(const CLArgOperation &) = delete; + + /** + * @brief Construct a new CLArgOperation object by using copy constructor + * @param[in] CLArgOperation object to move + */ + CLArgOperation(CLArgOperation &&) = default; + + /** + * @brief Assign a CLArgOperation object. + * @param[in] CLArgOperation object to assign. This object will be moved. + */ + CLArgOperation &operator=(CLArgOperation &&) = default; + + /** + * @brief Initialise the kernel's inputs and outputs. + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S32/F32. + * @param[out] output The result of arg operation. Data types supported: S32. + * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates. + * @param[in] op Arg operation to perform. + * @return N/A + */ + void configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis, ArgOperation op); + + /** + * @brief Static function to check if given info will lead to a valid configuration + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S32/F32. 
+ * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates. + * @param[out] output The result of arg operation. Data types supported: S32. + * @param[in] op Arg operation to perform. + * @return a status + */ + static Status validate(const ITensorInfo *input, const std::vector<uint32_t> &axis, + const ITensorInfo *output, ArgOperation op); + /** + * @brief Run the OpenCL kernel for this operation + * @return N/A + */ + void run() override; + +private: + ICLTensor *_input{nullptr}; + ICLTensor *_output{nullptr}; + std::vector<uint32_t> _axis{}; + ArgOperation _arg_op{ArgOperation::MAX}; + + std::unique_ptr<CLTensor[]> _interm_tensors{nullptr}; + std::unique_ptr<CLArgOperationKernel[]> _argop_kernels{nullptr}; + size_t _num_of_kernels{0}; +}; +} +#endif /*__ARM_COMPUTE_CLARGOPERATION_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h new file mode 100644 index 000000000..d16a0762d --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ +#define __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLBatchToSpaceNDKernel + * + * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. + * @note The function converts the input tensor to the tensor of the output tensor's type. + */ +class CLBatchToSpaceND : public ICLSimpleFunction +{ +public: + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[in] block_size A pointer to an array of integer values specifying block sizes + * for spatial dimension. + */ + void configure(ICLTensor *input, ICLTensor *output, const int32_t *block_size); +}; + +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h new file mode 100644 index 000000000..061e34f26 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLBINARYLOGICALOP_H__ +#define __ARM_COMPUTE_CLBINARYLOGICALOP_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ICLTensor; + +class CLBinaryLogicalOp : public ICLSimpleFunction +{ +public: + /** Initialise the function's source and destination. + * + * @param[in] input1 Source tensor1. Data types supported: U8, QASYMM8. + * @param[in] input2 Source tensor2. Data types supported: U8 QASYMM8. + * @param[out] output Output tensor. Data types supported: U8, QASYMM8. + */ + void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, + BinaryLogicalOperation op); +}; + +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLBINARYLOGICALOP_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h new file mode 100644 index 000000000..36acfaed7 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file CLCast.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLCast class + */ + +#ifndef __ARM_COMPUTE_CLCAST_H__ +#define __ARM_COMPUTE_CLCAST_H__ + +#include "arm_compute/core/TypesEx.h" +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to run @ref CLCastKernel. + * This converts the input tensor to the tensor of the output tensor's type. + */ +class CLCast : public ICLSimpleFunction +{ +public: + /** + * @brief Initialise the kernel's input and output + * @param[in, out] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * The input tensor is [in, out] because its TensorInfo might be + * modified inside the kernel. + * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[in] input_subtype Sub data type of input. + */ + void configure(ICLTensor *input, ICLTensor *output, SubDataType input_subtype); +}; +} +#endif /* __ARM_COMPUTE_CLCAST_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h new file mode 100644 index 000000000..d78a6ada4 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLDEPTHTOSPACE_H__ +#define __ARM_COMPUTE_CLDEPTHTOSPACE_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLDepthToSpaceKernel + * + * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. + * @note The function converts the input tensor to the tensor of the output tensor's type. + */ +class CLDepthToSpace : public ICLSimpleFunction +{ +public: + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[block_size] block size integer only + */ + void configure(ICLTensor *input, ICLTensor *output, const int32_t block_size); +}; +} // namesace arm_compute + +#endif /* __ARM_COMPUTE_CLDEPTHTOSPACE_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h new file mode 100644 index 000000000..257772a89 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CLEmbeddingLookup.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLEmbeddingLookup class + */ + +#ifndef __ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__ +#define __ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +#include <vector> + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to perform EmbeddingLookup operation + */ +class CLEmbeddingLookup : public ICLSimpleFunction +{ +public: + /** + * @brief Set the input and output tensors. + * @param[in] input Source tensor. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p + * input. + * @param[in] lookups Lookups 1D tensor that values are indices into the first dimension of + * input. + * @return N/A + */ + void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups); +}; +} +#endif /*__ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h new file mode 100644 index 000000000..fd0a65f20 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file       CLFullyConnectedReshapingLayer.h + * @brief      This file contains CLFullyConnectedReshapingLayer class + * @ingroup    COM_AI_RUNTIME + */ + +#ifndef __ARM_COMPUTE_CL_FULLY_CONNECTED_RESHAPING_LAYER_H__ +#define __ARM_COMPUTE_CL_FULLY_CONNECTED_RESHAPING_LAYER_H__ + +#include <arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h> +#include <arm_compute/runtime/misc/functions/GenericReshapeLayer.h> +#include <arm_compute/runtime/IMemoryManager.h> + +namespace arm_compute +{ +/** + * @brief Class to run FullyConnected Layer after reshaping input tensor + */ +class CLFullyConnectedReshapingLayer : public arm_compute::IFunction +{ +public: + CLFullyConnectedReshapingLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr) + : _input(nullptr), _weights(nullptr), _biases(nullptr), _output(nullptr), _cl_buffer{}, + _cl_fc{memory_manager}, _cl_reshape{}, _needs_reshape(false) + { + // DO NOTHING + } + +public: + /** + * @brief Configure the layer + * @param[in] input The source tensor + * @param[in] weights The tensor that is filled with weight values + * @param[in] biases The tensor that is filled with biase values + * @param[in] output The destination tensor + * @param[in] needs_reshape Whether it needs to be reshaped or not + * @param[in] reshape The tensor shape to be reshaped. Only valid when needs_reshape is true. 
+ * @return N/A + */ + void configure(const arm_compute::ICLTensor *input, const arm_compute::ICLTensor *weights, + const arm_compute::ICLTensor *biases, arm_compute::ICLTensor *output, + bool needs_reshape, const arm_compute::TensorShape &reshape); + +public: + /** + * @brief Run the operation. Must be called after configure(). + * @return N/A + */ + void run(void) override; + /** + * @brief Prepare the operation + * @return N/A + */ + void prepare(void) override; + +private: + const arm_compute::ICLTensor *_input; + const arm_compute::ICLTensor *_weights; + const arm_compute::ICLTensor *_biases; + arm_compute::ICLTensor *_output; + + // buffer for reshaping input tensor + arm_compute::CLTensor _cl_buffer; + +private: + arm_compute::CLFullyConnectedLayer _cl_fc; + // TODO Change to CLReshapeLayer + arm_compute::misc::GenericReshapeLayer _cl_reshape; + bool _needs_reshape; +}; +} // namespace arm_compute + +#endif // __ARM_COMPUTE_CL_FULLY_CONNECTED_RESHAPING_LAYER_H__ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h new file mode 100644 index 000000000..04d227aa7 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file CLGatherEx.h + * @brief This file contains CLGatherEx class + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __ARM_COMPUTE_CLGATHEREX_H__ +#define __ARM_COMPUTE_CLGATHEREX_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to to run @ref CLGatherKernel. + */ +class CLGatherEx : public ICLSimpleFunction +{ +public: + /** + * @brief Initialise the kernel's inputs, output and convertion policy. + * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32. + * @param[in] indices An indexes tensor. Data types supported: S32. + * @param[out] output The output tensor, Data types supported: same as @p input. + * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0 + * @return N/A + */ + void configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0); + + /** + * @brief Static function to check if given info will lead to a valid configuration + * of @ref CLGatherEx + * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32. + * @param[in] indices An indexes tensor. Data types supported: S32. + * @param[out] output The output tensor, Data types supported: same as @p input. + * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0 + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis = 0); +}; +} +#endif /*__ARM_COMPUTE_CLGATHEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h new file mode 100644 index 000000000..65aa6cbd5 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CLHashtableLookup.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLHashtableLookup class + */ + +#ifndef __ARM_COMPUTE_CLHASHTABLELOOKUP_H__ +#define __ARM_COMPUTE_CLHASHTABLELOOKUP_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +#include <vector> + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to perform HashtableLookup operation + */ +class CLHashtableLookup : public ICLSimpleFunction +{ +public: + /** + * @brief Set the input and output tensors. + * @param[in] lookups Lookups 1D tensor that values are indices into the first dimension of + * input. + * @param[in] keys Keys 1D tensor. keys and input pair represent a map. + * Data types supported: S32 + * @param[in] input Source tensor. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p + * input. + * @param[out] hits Hits 1D tensor. A boolean tensor that indicates whether the lookup hits + * (True) or not (False). 
Data types supported: U8/QASYMM8 + * @return N/A + */ + void configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *intput, + ICLTensor *output, ICLTensor *hits); +}; +} +#endif /*__ARM_COMPUTE_CLHASHTABLELOOKUP_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h new file mode 100644 index 000000000..ed29db925 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__ +#define __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to perform a Instance normalization. + * + * This function runs the following kernels: + * -# @ref CLInstanceNormalizationLayerKernelEx + */ +class CLInstanceNormalizationLayerEx : public ICLSimpleFunction +{ +public: + /** Default constructor */ + CLInstanceNormalizationLayerEx(); + /** Set the input and output tensors. + * + * @param[in, out] input Source tensor. In case of @p output tensor = nullptr this tensor will + * store the result of the normalization. + * Data types supported: F16/F32. Data layout supported: NHWC, NCHW + * @param[out] output Destination tensor. Data types and data layouts supported: same as @p + * input. + * @param[in] gamma (Optional) The scale tensor applied to the normalized tensor. Defaults + * to nullptr + * @param[in] beta (Optional) The offset tensor applied to the normalized tensor. Defaults + * to nullptr + * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12 + */ + void configure(ICLTensor *input, ICLTensor *output, ICLTensor *gamma = nullptr, + ICLTensor *beta = nullptr, float epsilon = 1e-12f); + + /** Static function to check if given info will lead to a valid configuration of @ref + * CLInstanceNormalizationLayerEx. + * + * @param[in] input Source tensor info. Data types supported: F16/F32. Data layout supported: + * NHWC, NCHW + * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p + * input. + * @param[in] gamma (Optional) The scale tensor applied to the normalized tensor. Defaults to + * nullptr + * @param[in] beta (Optional) The offset tensor applied to the normalized tensor. Defaults to + * nullptr + * @param[in] epsilon (Optional) Lower bound value for the normalization. 
Defaults to 1e-12 + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma = nullptr, const ITensorInfo *beta = nullptr, + float epsilon = 1e-12f); +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h new file mode 100644 index 000000000..4bf203c5a --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLLOGICALNOT_H__ +#define __ARM_COMPUTE_CLLOGICALNOT_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +class CLLogicalNot : public ICLSimpleFunction +{ +public: + /** Initialise the function's source and destination. + * + * @param[in] input Source tensor. Data types supported: QASYMM8. + * @param[out] output Output tensor. Data types supported: QASYMM8. 
+ */ + void configure(ICLTensor *input, ICLTensor *output); +}; + +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLLOGICALNOT_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h new file mode 100644 index 000000000..198a0fd4e --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLNEG_H__ +#define __ARM_COMPUTE_CLNEG_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +class CLNeg : public ICLSimpleFunction +{ +public: + /** Initialise the function's source and destination. + * + * @param[in] input Source tensor. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[out] output Output tensor. Data types supported: Same as @p input. 
+ * + */ + void configure(ICLTensor *input, ICLTensor *output); +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLNEG_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h new file mode 100644 index 000000000..622a61b5e --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLPRELU_H__ +#define __ARM_COMPUTE_CLPRELU_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +class CLPReLU : public ICLSimpleFunction +{ +public: + /** Initialise the function's source and destination. + * + * @param[in] input. Data types supported: + * QASYMM8/F16/F32. + * @param[in] alpha. Data types supported: + * QASYMM8/F16/F32. + * @param[out] output Output tensor. Data types supported: Same as @p input. 
+ */ + void configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output); +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLPRELU_H__*/ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h new file mode 100644 index 000000000..b142d3a2e --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CLPixelWiseDivision.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLPixelWiseDivision class + */ +#ifndef __ARM_COMPUTE_CLPIXELWISEDIVISION_H__ +#define __ARM_COMPUTE_CLPIXELWISEDIVISION_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to run @ref CLPixelWiseDivisionKernel. + */ +class CLPixelWiseDivision : public ICLSimpleFunction +{ +public: + /** + * @brief Initialise the kernel's inputs, output and convertion policy. + * @param[in, out] input1 An input tensor. Data types supported: U8/S16/F16/F32 + * The input tensor is [in, out] because its TensorInfo might be + * modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] input2 An input tensor. 
Data types supported: same as @p input1. + * The input tensor is [in, out] because its TensorInfo might be + * modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] output The output tensor, Data types supported: same as @p input1. + * Note: U8 requires both inputs to be U8. + * @param[in] scale Scale to apply after multiplication. + * Scale must be positive and its value must be either 1/255 or + * 1/2^n where n is between 0 and 15. + * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate + * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest + * even. + * @return N/A + */ + void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale = 1.f, + ConvertPolicy overflow_policy = ConvertPolicy::WRAP, + RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO); + + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref + * CLPixelWiseDivision + * @param[in] input1 An input tensor info. Data types supported: U8/S16/F16/F32 + * @param[in] input2 An input tensor info. Data types supported: same as @p input1. + * @param[in] output The output tensor info, Data types supported: same as @p input1. + * Note: U8 requires both inputs to be U8. + * @param[in] scale Scale to apply after multiplication. + * Scale must be positive and its value must be either 1/255 or 1/2^n + * where n is between 0 and 15. + * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate + * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. 
+ * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output, float scale = 1.f, + ConvertPolicy overflow_policy = ConvertPolicy::WRAP, + RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO); +}; +} +#endif /*__ARM_COMPUTE_CLPIXELWISEDIVISION_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h new file mode 100644 index 000000000..7e88cb369 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_CLRNN_LAYER_EX_H__ +#define __ARM_COMPUTE_CLRNN_LAYER_EX_H__ + +#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h" +#include "arm_compute/core/CL/kernels/CLCopyKernel.h" +#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h" +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" +#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h" +#include "arm_compute/runtime/CL/functions/CLGEMM.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLRNNLayerEx */ +class CLRNNLayerEx : public IFunction +{ +public: + /** Default constructor */ + CLRNNLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Initialize the function + * + * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data + * types supported: F16/F32 + * @param[in] weights Weights tensor of shape [input_size, num_units] that + * multiplies the input. Data types supported: Same as @p input + * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies + * the current 'state'. Data types supported: Same as @p input + * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same + * as @p input + * @param[out] output Output tensor of shape [num_units, batch_size]. Data types + * supported: Same as @p input + * @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. Data types + * supported: Same as @p input + * @param[in] info Activation layer parameter. + */ + void configure(const ICLTensor *input, const ICLTensor *weights, + const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state, + ICLTensor *output, ActivationLayerInfo &info); + /** Initialize the function + * + * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data + * types supported: F16/F32 + * @param[in] weights Weights tensor of shape [input_size, num_units] that multiplies + * the input. 
Data types supported: Same as @p input + * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies the + * current 'state'. Data types supported: Same as @p input + * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same as @p + * input + * @param[in] output Output tensor of shape [num_units, batch_size]. Data types + * supported: Same as @p input + * @param[in] hidden_state Output tensor of shape [num_units, batch_size]. Data types + * supported: Same as @p input + * @param[in] info Activation layer parameter. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *recurrent_weights, const ITensorInfo *bias, + const ITensorInfo *hidden_state, const ITensorInfo *output, + const ActivationLayerInfo &info); + + // Inherited methods overridden: + void run() override; + void prepare() override; + +private: + CLMemoryGroup _memory_group; + CLGEMM _gemm_state_f; + CLSaturatedArithmeticOperationKernel _add_kernel; + CLActivationLayerKernel _activation_kernel; + CLFullyConnectedLayer _fully_connected_kernel; + CLCopyKernel _copy_kernel; + CLTensor _fully_connected_out; + CLTensor _gemm_output; + CLTensor _add_output; + bool _is_prepared; +}; +} +#endif /* __ARM_COMPUTE_CLRNN_LAYER_EX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h new file mode 100644 index 000000000..1d367d56b --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CLReduceOperation.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLReduceOperation class + */ + +#ifndef __ARM_COMPUTE_CLREDUCEOPERATION_H__ +#define __ARM_COMPUTE_CLREDUCEOPERATION_H__ + +#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h" +#include "arm_compute/core/TypesEx.h" +#include "arm_compute/runtime/CL/CLMemoryGroup.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/CL/CLTensorAllocator.h" +#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to perform ReduceOperation + */ +class CLReduceOperation : public IFunction +{ +public: + /** + * @brief Construct a new ReduceOperation object + */ + CLReduceOperation(std::shared_ptr<IMemoryManager> memory_manager); + + /** + * @brief Set the input and output tensors. + * @param[in] input Source tensor. Data types supported: U8/S32/F32 + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p + * input. + * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates. + * @param[in] keep_dims If positive, retains reduced dimensions with length 1. + * @param[in] op Reduce operation to perform. 
+ * @return N/A + */ + void configure(ICLTensor *input, ICLTensor *output, const std::set<uint32_t> &axis, + bool keep_dims, ReduceOperation op); + + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref + * CLReduceOperation. + * @param[in] input Source tensor info. Data types supported: U8/S32/F32 + * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p + * input. + * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates. + * @param[in] keep_dims If positive, retains reduced dimensions with length 1. + * @param[in] op Reduce operation to perform. + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const std::set<uint32_t> &axis, bool keep_dims, const ReduceOperation &op); + + /** + * @brief Run the OpenCL kernel for this operation + * @return N/A + */ + void run() override; + +private: + CLMemoryGroup _memory_group; + ICLTensor *_input; + ICLTensor *_output; + std::set<uint32_t> _axis; + bool _keep_dims; + + std::unique_ptr<CLTensor[]> _interm_tensors{nullptr}; + std::unique_ptr<CLReduceOperationKernel[]> _reduce_kernels{nullptr}; + CLReshapeLayer _reshape; +}; +} +#endif /*__ARM_COMPUTE_CLREDUCEOPERATION_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h new file mode 100644 index 000000000..7e2df8986 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_H__ +#define __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLSpaceToBatchNDKernel + * + * @note The tensor data type for the inputs must be U8/QASYMM8/S16/F16/S32/F32. + * @note The function divides "spatial" dimensions of the input into a grid of blocks of shape + * block_shape, and interleaves these blocks with the "batch" dimension such that in the output. + */ +class CLSpaceToBatchND : public ICLSimpleFunction +{ +public: + /** Initialise the kernel's input and output. + * + * @note The data layout of input and output must be the same. + * @note The number of dimensions of input and output must be 4, and `spatial` dimensions + * are height and width. + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32. + * Data layout supported: NCHW/NHWC + * @param[in] block_size Tensor of integer values specifying block sizes for spatial + * dimension. + * Data types supported: S32 + * @param[in] padding_size Tensor of integer values specifying padding sizes for spatial + * dimension. + * Data types supported: S32 + * @param[out] output Output tensor. Data types supported: same as @p input. 
+ * Data layout supported: NCHW/NHWC + */ + void configure(const ICLTensor *input, const ICLTensor *block_size, const ICLTensor *padding_size, + ICLTensor *output); +}; + +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h new file mode 100644 index 000000000..17f762092 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLSPACETODEPTH_H__ +#define __ARM_COMPUTE_CLSPACETODEPTH_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLSpaceToDepthKernel + * + * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. + * @note The function converts the input tensor to the tensor of the output tensor's type. + */ +class CLSpaceToDepth : public ICLSimpleFunction +{ +public: + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. 
+ * @param[in] block_size Block size (integer only) + */ + void configure(ICLTensor *input, ICLTensor *output, const int32_t block_size); +}; + +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLSPACETODEPTH_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h new file mode 100644 index 000000000..6b26a85c8 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CLStridedSlice.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLStridedSlice and arm_compute::CLStridedSliceCPU class + */ + +#ifndef __ARM_COMPUTE_CLSTRIDEDSLICEEX_H__ +#define __ARM_COMPUTE_CLSTRIDEDSLICEEX_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to run @ref CLStridedSliceKernel + */ +class CLStridedSliceEx : public ICLSimpleFunction +{ +public: + /** + * @brief Initialise the kernel's inputs and outputs + * @param[in] input Tensor input. Data type supported: + * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Output tensor.
Data type supported: Same as @p input + * @param[in] beginData 'begin' vector of strided slice operation + * @param[in] endData 'end' vector of strided slice operation + * @param[in] stridesData 'strides' vector of strided slice operation + * @param[in] beginMask If the ith bit is set, begin[i] is ignored + * @param[in] endMask If the ith bit is set, end[i] is ignored + * @param[in] shrinkAxisMask If the ith bit is set, the ith specification shrinks the + * dimensionality by 1, taking on the value at index begin[i] + * @return N/A + */ + void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData, + ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask, + int32_t shrinkAxisMask); +}; +} +#endif /*__ARM_COMPUTE_CLSTRIDEDSLICEEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h new file mode 100644 index 000000000..20c749e0b --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file CLTopKV2.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLTopKV2 class + */ +#ifndef __ARM_COMPUTE_CLTOPK_V2_H__ +#define __ARM_COMPUTE_CLTOPK_V2_H__ + +#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h" + +#include "arm_compute/runtime/IFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to execute TopKV2 operation. + */ +class CLTopKV2 : public IFunction +{ +public: + /** + * @brief Construct a new CLTopKV2 object + */ + CLTopKV2(); + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + */ + CLTopKV2(const CLTopKV2 &) = delete; + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + */ + CLTopKV2 &operator=(const CLTopKV2 &) = delete; + + /** + * @brief Construct a new CLTopKV2 object by using copy constructor + * @param[in] CLTopKV2 object to move + */ + CLTopKV2(CLTopKV2 &&) = default; + + /** + * @brief Assign a CLTopKV2 object. + * @param[in] CLTopKV2 object to assign. This object will be moved. + */ + CLTopKV2 &operator=(CLTopKV2 &&) = default; + + /** + * @brief Initialise the kernel's inputs and outputs. + * @param[in] input Input image. Data types supported: U8/S16/F32. + * @param[in] k The value of `k`. + * @param[out] values Top k values. Data types supported: S32 if input type is U8/S16, F32 if + * input type is F32. + * @param[out] indices Indices related to top k values. Data types supported: S32 if input type + * is U8/S16, F32 if input type is F32. + * @return N/A + */ + void configure(ICLTensor *input, int k, ICLTensor *values, ICLTensor *indices, + int total_bits = 32, int bits = 4); + + /** + * @brief Run the kernels contained in the function + * Depending on the value of the following environment variables it works differently: + * - If the value of environment variable "ACL_TOPKV2" == "GPU_SINGLE", + * quick sort on GPU is used. 
+ * - If the value of environment variable "ACL_TOPKV2" == ""GPU"", + * radix sort on GPU is used. + * - For other value, TopKV2 runs on CPU + * @return N/A + */ + void run() override; + +private: + void run_on_cpu(); + void run_on_gpu(); + void run_on_gpu_single_quicksort(); + + uint32_t _k; + uint32_t _total_bits; + uint32_t _bits; + uint32_t _radix; + uint32_t _hist_buf_size; + uint32_t _glob_sum_buf_size; + uint32_t _n; + + ICLTensor *_input; + ICLTensor *_values; + ICLTensor *_indices; + + cl::Buffer _qs_idx_buf; + cl::Buffer _qs_temp_buf; + cl::Buffer _hist_buf; + cl::Buffer _glob_sum_buf; + cl::Buffer _temp_buf; + cl::Buffer _first_negative_idx_buf; + cl::Buffer _in_key_buf; + cl::Buffer _out_key_buf; + cl::Buffer _in_ind_buf; + cl::Buffer _out_ind_buf; + + cl::Buffer *_p_in_key_buf; + cl::Buffer *_p_out_key_buf; + cl::Buffer *_p_in_ind_buf; + cl::Buffer *_p_out_ind_buf; +// Disable GPU implementation +// TODO Enable GPU implementation with verification, or remove code +// Invalid result on GPU +#if 0 + CLTopKV2Single _qs_kernel; + CLTopKV2Init _init_kernel; + CLRadixSortHistogram _hist_kernel; + CLRadixSortScanHistogram _scan_hist_kernel; + CLRadixSortGlobalScanHistogram _glob_scan_hist_kernel; + CLRadixSortPasteHistogram _paste_hist_kernel; + CLRadixSortReorder _reorder_kernel; + CLTopKV2FindFirstNegative _find_first_negative_kernel; + CLTopKV2ReorderNegatives _reorder_negatives_kernel; + CLTopKV2Store _store_kernel; +#endif +}; +} +#endif // __ARM_COMPUTE_CLTOPK_V2_H__ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h new file mode 100644 index 000000000..340a7bfe9 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__ +#define __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__ + +#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h" +#include "arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h" + +#include "arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h" + +#include "arm_compute/runtime/CL/CLMemoryGroup.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" + +#include <memory> + +namespace arm_compute +{ +class ICLTensor; +/** Function to run the transpose convolution layer. + * + * @note This layer was copied in order to fix a bug computing to wrong output dimensions. + * + * TransposeConv Layer is the backward pass of Convolution Layer. 
First we transform the input + * depending on the stride and pad info and then perform a 1x1 + * convolution pass. Input stride defines how many zeroes we should put between each element of the + * input, pad is the amount of padding and finally a is a user + * specified value where a < stride - 1, that increases the padding top and right of the input + * image. + * + * The relation between input to output is as follows: + * \f[ + * width\_output = (width\_input - 1) \cdot stride\_x - \cdot padding\_x + kernel\_x + * \f] + * \f[ + * height\_output = (height\_input - 1) \cdot stride\_y - \cdot padding\_y + kernel\_y + * \f] + * + * where: + * width_input is the size of the first input dimension. + * height_input is the size of the second input dimension. + * width_output is the size of the first output dimension. + * height_output is the size of the second output dimension. + * kernel_x and kernel_y are the convolution sizes in x and y. + * stride_x and stride_y is the input stride of the first and second dimension. + * + * The weights used by Deconvolution are supposed to be the same as the ones used for Convolution. + * Therefore, it will be necessary to use the weights in the + * reverse order to perform an actual convolution. This is achieved by using the @ref + * CPPFlipWeightsKernel. 
+ * + * This function calls the following OpenCL kernels/functions: + * + * -# @ref CLTransposeConvLayerUpsample + * -# @ref CLConvolutionLayer + * + */ +class CLTransposeConvLayer : public IFunction +{ +public: + /** Constructor */ + CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLTransposeConvLayer(const CLTransposeConvLayer &) = delete; + /** Default move constructor */ + CLTransposeConvLayer(CLTransposeConvLayer &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLTransposeConvLayer &operator=(const CLTransposeConvLayer &) = delete; + /** Default move assignment operator */ + CLTransposeConvLayer &operator=(CLTransposeConvLayer &&) = default; + /** Set the input, weights, biases and output tensors. + * + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, + * and an optional 4th dimension for batch of inputs. + * Data types supported: QASYMM8/F16/F32. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. Data type supported: + * Same as @p input. + * @param[out] output Output tensor. The output has the same number of dimensions + * as the @p input. + * @param[in] info Contains padding and policies to be used in the + * transpose convolution, this is described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for @ref + * CLConvolutionLayer, specifies if the weights tensor has been + * reshaped with @ref CLWeightsReshapeKernel.
+ */ + void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, + const PadStrideInfo &info, unsigned int invalid_right, unsigned int invalid_bottom, + const WeightsInfo &weights_info = WeightsInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLTransposeConvLayer + * + * @param[in] input Input tensor info. 3 lower dimensions represent a single input, + * and an optional 4th dimension for batch of inputs. + * Data types supported: QASYMM8/F16/F32. + * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. Data type supported: + * Same as @p input. + * @param[in] output Output tensor info. The output has the same number of dimensions + * as the @p input. + * @param[in] info Contains padding and policies to be used in the + * transpose convolution, this is described in @ref PadStrideInfo. + * @param[in] innvalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, + * specifies if the weights tensor has been reshaped with @ref + * CLWeightsReshapeKernel.
+ * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info, + unsigned int innvalid_right, unsigned int invalid_bottom, + const WeightsInfo &weights_info = WeightsInfo()); + + // Inherited methods overridden: + void run() override; + void prepare() override; + +private: + CLMemoryGroup _memory_group; + CLTransposeConvLayerUpsample _scale_f; + CLConvolutionLayer _conv_f; + CPPFlipWeightsKernel _flip_weights; + CLTensor _scaled_output; + ICLTensor *_original_weights; + CLTensor _weights_flipped; + bool _is_prepared; +}; +} +#endif /* __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h new file mode 100644 index 000000000..4ae0e1830 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLE_H__ +#define __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLE_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/CLMemoryGroup.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLTransposeConvLayerUpsampleKernel */ +class CLTransposeConvLayerUpsample : public IFunction +{ +public: + /** Default constructor */ + CLTransposeConvLayerUpsample(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLTransposeConvLayerUpsample(const CLTransposeConvLayerUpsample &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLTransposeConvLayerUpsample &operator=(const CLTransposeConvLayerUpsample &) = delete; + /** Allow instances of this class to be moved */ + CLTransposeConvLayerUpsample(CLTransposeConvLayerUpsample &&) = default; + /** Allow instances of this class to be moved */ + CLTransposeConvLayerUpsample &operator=(CLTransposeConvLayerUpsample &&) = default; + /** Default destructor */ + virtual ~CLTransposeConvLayerUpsample() = default; + + /** Initialize the function's source, destination, interpolation type and border_mode. + * + * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32. + * @param[out] output Destination tensor. Data type supported: same as @p input. + * @param[in] inner_border The number of zeros added to right and top edges of the input. + * @param[in] info Contains padding and policies to be used in the deconvolution. 
+ */ + void configure(ICLTensor *input, ICLTensor *output, const BorderSize &inner_border, + const PadStrideInfo &info); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLTransposeConvLayerUpsample + * + * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. + * @param[in] output Destination tensor info. Data type supported: same as @p input. + * @param[in] inner_border The number of zeros added to right and top edges of the input. + * @param[in] info Contains padding and policies to be used in the deconvolution. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const BorderSize &inner_border, const PadStrideInfo &info); + + // Inherited methods overridden: + void run() override; + +private: + CLTransposeConvLayerUpsampleKernel _upsample; + ICLTensor *_output; +}; +} +#endif /* __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLE_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h b/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h new file mode 100644 index 000000000..8e7e2f937 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CPPUPSAMPLE_EX_H__ +#define __ARM_COMPUTE_CPPUPSAMPLE_EX_H__ + +#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h" + +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to run @ref CPPUpsample */ +class CPPUpsampleEx : public ICPPSimpleFunction +{ +public: + /** Configure the upsample CPP kernel + * + * @param[in] input The input tensor to upsample. Data types supported: F32/F16/QASYMM8 + * @param[out] output The output tensor. 
Data types supported: Same as @p input + * @param[in] info Padding information + */ + void configure(const ITensor *input, ITensor *output, const PadStrideInfo &info); +}; +} +#endif /* __ARM_COMPUTE_CPPUPSAMPLE_EX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h new file mode 100644 index 000000000..37bccc52c --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __ARM_COMPUTE_NEFUNCTIONSEX_H__ +#define __ARM_COMPUTE_NEFUNCTIONSEX_H__ + +#include <arm_compute/runtime/NEON/functions/NEArgMinMax.h> +#include <arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h> +#include <arm_compute/runtime/NEON/functions/NECast.h> +#include <arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h> +#include <arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h> +#include <arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h> +#include <arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h> +#include <arm_compute/runtime/NEON/functions/NEGatherEx.h> +#include <arm_compute/runtime/NEON/functions/NEHashtableLookup.h> +#include <arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h> +#include <arm_compute/runtime/NEON/functions/NEPReLU.h> +#include <arm_compute/runtime/NEON/functions/NEReduceMeanEx.h> +#include <arm_compute/runtime/NEON/functions/NEReduceSum.h> +#include <arm_compute/runtime/NEON/functions/NERNNLayerEx.h> +#include <arm_compute/runtime/NEON/functions/NEReduceOperation.h> +#include <arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h> +#include <arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h> +#include <arm_compute/runtime/NEON/functions/NETransposeConvLayer.h> + +#endif // __ARM_COMPUTE_NEFUNCTIONSEX_H__ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEArgMinMax.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEArgMinMax.h new file mode 100644 index 000000000..604cd93c4 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEArgMinMax.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEON_ARG_MIN_MAX_H__ +#define __ARM_COMPUTE_NEON_ARG_MIN_MAX_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h" +#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to perform reduce min/max operation */ +template <ReductionOperation op> class NEArgMinMaxStatic : public IFunction +{ +public: + /** Constructor */ + NEArgMinMaxStatic(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Configure kernel + * + * @note Supported tensor rank: up to 4 + * + * @param[in] input Source tensor. 
Data type supported: QASYMM8/F16/F32 + * @param[in] axis Reduction axis. + * @param[out] output Destination tensor. Data type supported: Same as @p input + */ + void configure(ITensor *input, int axis, ITensor *output); + + /** Static function to check if given info will lead to a valid configuration of @ref NEArgMinMax + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 + * @param[in] axis Reduction axis. + * @param[in] output Destination tensor. Data type supported: Same as @p input + * + * @return A status + */ + static Status validate(const ITensorInfo *input, int axis, const ITensorInfo *output); + + // Inherited methods overridden: + void run() override; + +private: + MemoryGroup _memory_group; + NEArgMinMaxLayer _reduction_kernel; + Tensor _reduced_out; + NEReshapeLayer _reshape; +}; + +/** Basic function to run arg max. */ +using NEArgMax = NEArgMinMaxStatic<ReductionOperation::ARG_IDX_MAX>; +/** Basic function to run arg min. */ +using NEArgMin = NEArgMinMaxStatic<ReductionOperation::ARG_IDX_MIN>; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEON_ARG_MIN_MAX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h new file mode 100644 index 000000000..2a624656d --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__ +#define __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__ + +#include "arm_compute/core/TypesEx.h" +#include "arm_compute/runtime/NEON/INESimpleFunction.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to run @ref NEBinaryLogicalOperationKernel. + * + * @note The tensor data type for the inputs must be QASYMM8/U8. + * @note The function performs a binary logical operation between two tensors. + */ +class NEBinaryLogicalOperation : public INESimpleFunction +{ +public: + /** Initialise the kernel's inputs, output and conversion policy. + * + * @param[in, out] input1 First tensor input. Data types supported: QASYMM8/U8. + * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1. + * @param[out] output Output tensor. Data types supported: Same as @p input1. 
+ * @param[in] op Binary Logical Operation to be performed. + */ + void configure(ITensor *input1, ITensor *input2, ITensor *output, BinaryLogicalOperation op); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEBinaryLogicalOperationKernel + * + * @param[in] input1 First tensor input info. Data types supported: QASYMM8/U8. + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[in] output Output tensor info. Data types supported: Same as @p input1. + * @param[in] op Binary Logical Operation to be performed. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output, BinaryLogicalOperation op); +}; + +/** Basic function to run @ref NEBinaryLogicalOperationKernel + * + * @note The tensor data type for the inputs must be QASYMM8/U8. + * @note The function performs a binary logical operation between two tensors. + */ +template <BinaryLogicalOperation op> class NEBinaryLogicalOperationStatic : public INESimpleFunction +{ +public: + /** Initialise the kernel's inputs, output and conversion policy. + * + * @param[in, out] input1 First tensor input. Data types supported: QASYMM8/U8 + * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1. + * @param[out] output Output tensor. Data types supported: Same as @p input1. + */ + void configure(ITensor *input1, ITensor *input2, ITensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEBinaryLogicalOperationKernel + * + * @param[in] input1 First tensor input info. Data types supported: QASYMM8/U8 + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[in] output Output tensor info. Data types supported: Same as @p input1. 
 + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output); +}; + +/** Basic function to run logical AND. */ +using NELogicalAnd = NEBinaryLogicalOperationStatic<BinaryLogicalOperation::AND>; +/** Basic function to run logical OR. */ +using NELogicalOr = NEBinaryLogicalOperationStatic<BinaryLogicalOperation::OR>; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h new file mode 100644 index 000000000..ae2f57f19 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NECAST_H__ +#define __ARM_COMPUTE_NECAST_H__ + +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" + +#include "arm_compute/core/Types.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +// Forward declarations +class ITensor; + +/** Basic function to run @ref NECastKernel that converts an input tensor to the other types */ +class NECast : public INESimpleFunctionNoBorder +{ +public: + /** Configure the kernel. + * + * @param[in] input Source tensor. Data types supported: U8/S8/QASYMM8/U32/S32/F32. + * @param[out] output Destination tensor with the same dimensions of input. Data type supported: + * U8/S8/QASYMM8/U32/S32/F32. + * @param[in] input_subtype Sub data type of input. + */ + void configure(const ITensor *input, ITensor *output, + SubDataType input_subtype = SubDataType::NONE); + /** Static function to check if given info will lead to a valid configuration of @ref NECast + * + * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U32/S32/F32. + * @param[in] output Output tensor info. Data type supported: U8/S8/QASYMM8/U32/S32/F32. + * @param[in] input_subtype Sub data type of input. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + SubDataType input_subtype = SubDataType::NONE); +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NECAST_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h new file mode 100644 index 000000000..90c0751b8 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_NEDEPTHTOSPACELAYEREX_H__ +#define __ARM_COMPUTE_NEDEPTHTOSPACELAYEREX_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to run @ref NEDepthToSpaceLayerKernelEx. */ +class NEDepthToSpaceLayerEx : public INESimpleFunctionNoBorder +{ +public: + /** Set the input and output tensors. + * + * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[out] output Tensor output. Data types supported: same as @p input + * @param[in] block_shape Block shape value. + */ + void configure(const ITensor *input, ITensor *output, int32_t block_shape); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEDepthToSpaceLayerEx. + * + * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] output Tensor output info. Data types supported: same as @p input + * @param[in] block_shape Block shape x value. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEDEPTHTOSPACELAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h new file mode 100644 index 000000000..f0c8ecdb5 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEELEMENTWISEUNARYLAYEREX_H__ +#define __ARM_COMPUTE_NEELEMENTWISEUNARYLAYEREX_H__ + +#include "arm_compute/runtime/NEON/INESimpleFunction.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to perform negative on an input tensor. */ +class NENegLayer : public INESimpleFunction +{ +public: + /** Initialize the function + * + * @param[in] input Input tensor. Data types supported: F16/F32/S32. + * @param[out] output Output tensor. Data types supported: same as @p input. + */ + void configure(const ITensor *input, ITensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref NERsqrtLayer + * + * @param[in] input First tensor input info. Data types supported: F16/F32/S32. + * @param[in] output Output tensor info. Data types supported: Same as @p input. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEELEMENTWISEUNARYLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h new file mode 100644 index 000000000..0646f1668 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file NEEmbeddingLookup.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::NEEmbeddingLookup class + */ + +#ifndef __ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__ +#define __ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__ + +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" + +#include <vector> + +namespace arm_compute +{ +class ITensor; + +/** + * @brief Class to perform EmbeddingLookup operation + */ +class NEEmbeddingLookup : public INESimpleFunctionNoBorder +{ +public: + /** + * @brief Set the input and output tensors. + * @param[in] input Source tensor. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p + * input. 
 + * @param[in] lookups Lookups 1D tensor that values are indices into the first dimension of + * input. Data types supported: S32. + * @return N/A + */ + void configure(const ITensor *input, ITensor *output, const ITensor *lookups); + /** Static function to check if given info will lead to a valid configuration of @ref NEEmbeddingLookup + * + * @param[in] input Source tensor info. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] output Output tensor info. Data types supported: Same as @p input. + * @param[in] lookups Lookups tensor info. Data types supported: S32. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *lookups); +}; +} +#endif /*__ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h new file mode 100644 index 000000000..42a786821 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEFULLYCONNECTEDHYBRIDLAYER_H__ +#define __ARM_COMPUTE_NEFULLYCONNECTEDHYBRIDLAYER_H__ + +#include "arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h" +#include "arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h" +#include "arm_compute/core/NEON/kernels/NETransposeKernel.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h" +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +#include "arm_compute/runtime/Tensor.h" + +namespace arm_compute +{ +/** Basic function to reshape the weights of Fully Connected layer with NEON. This function calls + * the following kernels: + * + * -# @ref NETransposeKernel + * + * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. + */ +class NEFullyConnectedHybridLayerReshapeWeights : public INESimpleFunctionNoBorder +{ +public: + /** Set the input and output tensors. + * + * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: + * QASYMM8/F16/F32. + * @param[out] output Destination tensor. Data type supported: Same as @p input. + */ + void configure(const ITensor *input, ITensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEFullyConnectedHybridLayerReshapeWeights + * + * @param[in] input Weights tensor info. 
The weights must be 2 dimensional. Data types supported: + * QASYMM8/F16/F32. + * @param[in] output Destination tensor info. Data type supported: Same as @p input. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); +}; + +/** Basic function to compute a Fully Connected layer on NEON. This function calls the following + * NEON kernels: + * -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer) + * -# @ref NEFullyConnectedHybridLayerReshapeWeights (if @p are_weights_reshaped is set to false + * and transpose_weights is set to true ) (called once) + * -# @ref NEGEMMMatrixMultiplyKernel or @ref NEGEMMLowpMatrixMultiplyCore (if quantized + * asymmetric) + * -# @ref NEGEMMMatrixAccumulateBiasesKernel or @ref + * NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric) (if @p biases is + * not equal to nullptr) + * + * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. + */ +class NEFullyConnectedHybridLayer : public IFunction +{ +public: + /** Constructor */ + NEFullyConnectedHybridLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEFullyConnectedHybridLayer(const NEFullyConnectedHybridLayer &) = delete; + /** Default move constructor */ + NEFullyConnectedHybridLayer(NEFullyConnectedHybridLayer &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEFullyConnectedHybridLayer &operator=(const NEFullyConnectedHybridLayer &) = delete; + /** Default move assignment operator */ + NEFullyConnectedHybridLayer &operator=(NEFullyConnectedHybridLayer &&) = default; + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data type supported: F16/F32. + * @param[in] weights Weights tensor. The weights must be 2 dimensional. 
+ * If this function is called after a Convolution Layer, the (transposed) + * weights will have as many rows as the product of the first 3 input's dimensions. + * If it is called after another FullyConnected Layer, the (transposed) + * weights will have as many rows as the input's first dimension. + * Data type supported: S8. + * @param[in] biases Bias tensor. Can be nullptr. Data type supported:Same as @p input. + * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix + * multiplication between: + * - The output of im2col on the input and the (transposed) 2D weights, if the + * function is called after a Convolution Layer + * - The input tensor and the (transposed) 2D weights, if the function is + * called after another FullyConnected Layer. + * Data type supported: Same as @p input. + * @param[in] fc_info (Optional) Fully connected layer additional info + */ + void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, + ITensor *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEFullyConnectedHybridLayer + * + * @param[in] input Source tensor info. Data type supported: F16/F32. + * @param[in] weights Weights tensor info. The weights must be 2 dimensional. + * If this function is called after a Convolution Layer, the (transposed) + * weights will have as many rows as the product of the first 3 input's dimensions. + * If it is called after another FullyConnected Layer, the (transposed) + * weights will have as many rows as the input's first dimension. + * Data type supported: S8. + * @param[in] biases Bias tensor info. Can be nullptr. Data type supported:Same as @p input. + * @param[out] output Destination tensor info. 
Its shape should be equal to the output of a + * matrix multiplication between: + * - The output of im2col on the input and the (transposed) 2D weights, if the + * function is called after a Convolution Layer + * - The input tensor and the (transposed) 2D weights, if the function is + * called after another FullyConnected Layer. + * Data type supported: Same as @p input. + * @param[in] fc_info (Optional) Fully connected layer additional info + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *biases, const ITensorInfo *output, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + + // Inherited methods override + void run() override; + void prepare() override; + +private: + void configure_mm(const ITensor *input, const ITensor *weights, ITensor *output); + + MemoryGroup _memory_group; + NEFullyConnectedHybridLayerReshapeWeights _reshape_weights_function; + NEQuantizationSymmetricKernel _quant_input_kernel; + NEGEMMLowpMatrixMultiplyCoreEx _mm_gemmlowp; + NEMultiplyScaleFactorKernel _multiply_scale_kernel; + NEGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel; + Tensor _reshape_weights_output; + Tensor _quantized_input; + Tensor _scale_factor; + Tensor _gemmlowp_output; + const ITensor *_original_weights; + bool _are_weights_reshaped; + bool _accumulate_biases; + bool _is_prepared; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEFULLYCONNECTEDHYBRIDLAYER_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h new file mode 100644 index 000000000..6bd67f322 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_NEFULLYCONNECTEDLAYEREX_H__ +#define __ARM_COMPUTE_NEFULLYCONNECTEDLAYEREX_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h" +#include "arm_compute/core/NEON/kernels/NETransposeKernel.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h" +#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h" +#include "arm_compute/runtime/NEON/functions/NEGEMM.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h" +#include "arm_compute/runtime/Tensor.h" + +namespace arm_compute +{ +/** Basic function to compute a Fully Connected layer on NEON. This function calls the following + * NEON kernels: + * -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer) + * -# @ref NEFullyConnectedLayerReshapeWeights (if @p are_weights_reshaped is set to false and + * transpose_weights is set to true ) (called once) + * -# @ref NEGEMMMatrixMultiplyKernel or @ref NEGEMMLowpMatrixMultiplyCore (if quantized + * asymmetric) + * -# @ref NEGEMMMatrixAccumulateBiasesKernel or @ref + * NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric) (if @p biases is + * not equal to nullptr) + * + * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. + * @note The difference from NEFullyConnectedLayer is that this class supports weights as input + * with performance loss. 
+ */ +class NEFullyConnectedLayerEx : public IFunction +{ +public: + /** Constructor */ + NEFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEFullyConnectedLayerEx(const NEFullyConnectedLayerEx &) = delete; + /** Default move constructor */ + NEFullyConnectedLayerEx(NEFullyConnectedLayerEx &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEFullyConnectedLayerEx &operator=(const NEFullyConnectedLayerEx &) = delete; + /** Default move assignment operator */ + NEFullyConnectedLayerEx &operator=(NEFullyConnectedLayerEx &&) = default; + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. + * @param[in] weights Weights tensor. The weights must be 2 dimensional. + * If this function is called after a Convolution Layer, the (transposed) + * weights will have as many rows as the product of the first 3 input's dimensions. + * If it is called after another FullyConnected Layer, the (transposed) + * weights will have as many rows as the input's first dimension. + * Data type supported: Same as @p input. + * @param[in] biases Bias tensor. Can be nullptr. Data type supported:Same as @p input. + * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix + * multiplication between: + * - The output of im2col on the input and the (transposed) 2D weights, if the + * function is called after a Convolution Layer + * - The input tensor and the (transposed) 2D weights, if the function is + * called after another FullyConnected Layer. + * Data type supported: Same as @p input. 
+ * @param[in] fc_info (Optional) Fully connected layer additional info + */ + void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, + ITensor *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEFullyConnectedLayerEx + * + * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. + * @param[in] weights Weights tensor info. The weights must be 2 dimensional. + * If this function is called after a Convolution Layer, the (transposed) + * weights will have as many rows as the product of the first 3 input's dimensions. + * If it is called after another FullyConnected Layer, the (transposed) + * weights will have as many rows as the input's first dimension. + * Data type supported: Same as @p input. + * @param[in] biases Bias tensor info. Can be nullptr. Data type supported:Same as @p input. + * @param[out] output Destination tensor info. Its shape should be equal to the output of a + * matrix multiplication between: + * - The output of im2col on the input and the (transposed) 2D weights, if the + * function is called after a Convolution Layer + * - The input tensor and the (transposed) 2D weights, if the function is + * called after another FullyConnected Layer. + * Data type supported: Same as @p input. 
+ * @param[in] fc_info (Optional) Fully connected layer additional info + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *biases, const ITensorInfo *output, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + + // Inherited methods override + void run() override; + void prepare() override; + +private: + void configure_fc_fc(const ITensor *input, const ITensor *weights, ITensor *output); + void configure_conv_fc(const ITensor *input, const ITensor *weights, ITensor *output); + void configure_mm(const ITensor *input, const ITensor *weights, ITensor *output); + + MemoryGroup _memory_group; + NEFlattenLayerKernel _flatten_kernel; + NEConvertFullyConnectedWeights _convert_weights; + NEFullyConnectedLayerReshapeWeights _reshape_weights_function; + NEGEMM _mm_gemm; + NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp; + NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint _gemmlowp_output_stage; + NEGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel; + Tensor _flatten_output; + Tensor _gemmlowp_output; + Tensor _converted_weights_output; + Tensor _reshape_weights_output; + const ITensor *_original_weights; + bool _are_weights_converted; + bool _are_weights_reshaped; + bool _is_fc_after_conv; + bool _accumulate_biases; + bool _is_quantized; + bool _is_prepared; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEFULLYCONNECTEDLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h new file mode 100644 index 000000000..18cb61bf9 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file       NEFullyConnectedReshapingLayer.h + * @brief      This file contains NEFullyConnectedReshapingLayer class + * @ingroup    COM_AI_RUNTIME + */ + +#ifndef __ARM_COMPUTE_NE_FULLY_CONNECTED_RESHAPING_LAYER_H__ +#define __ARM_COMPUTE_NE_FULLY_CONNECTED_RESHAPING_LAYER_H__ + +#include <arm_compute/runtime/NEON/functions/NEReshapeLayer.h> +#include <arm_compute/runtime/IMemoryManager.h> +#include <arm_compute/runtime/Tensor.h> + +namespace arm_compute +{ +/** + * @brief Class to run FullyConnected Layer after reshaping input tensor + */ +class NEFullyConnectedReshapingLayer : public arm_compute::IFunction +{ +public: + enum class KernelType + { + GENERAL, //< General FC + PREPROCESSED_WEIGHTS //< Weights are constants so it can be preprocessed + }; + +public: + NEFullyConnectedReshapingLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr) + : _memory_manager{memory_manager}, _input(nullptr), _weights(nullptr), _biases(nullptr), + _output(nullptr), _neon_buffer{}, _neon_fc{nullptr}, _neon_reshape{}, _needs_reshape(false) + { + // DO NOTHING + } + +public: + /** + * @brief Configure the layer + * @param[in] input The source tensor + * @param[in] weights The tensor that is filled with weight values + * @param[in] biases The tensor that is filled with biase values + * @param[in] output The destination tensor + * @param[in] needs_reshape Whether it needs to be 
reshaped or not + * @param[in] reshape The tensor shape to be reshaped. Only valid when needs_reshape is true. + * @param[in] kernel_type The kernel type for actual FullyConnected layer + * @return N/A + */ + void configure(const arm_compute::ITensor *input, const arm_compute::ITensor *weights, + const arm_compute::ITensor *biases, arm_compute::ITensor *output, + bool needs_reshape, const arm_compute::TensorShape &reshape, + KernelType kernel_type); + +public: + /** + * @brief Run the operation. Must be called after configure(). + * @return N/A + */ + void run(void) override; + /** + * @brief Prepare the operation + * @return N/A + */ + void prepare(void) override; + +private: + std::shared_ptr<IMemoryManager> _memory_manager; + const arm_compute::ITensor *_input; + const arm_compute::ITensor *_weights; + const arm_compute::ITensor *_biases; + arm_compute::ITensor *_output; + + // buffer for reshaping input tensor + arm_compute::Tensor _neon_buffer; + +private: + std::unique_ptr<arm_compute::IFunction> _neon_fc; + NEReshapeLayer _neon_reshape; + bool _needs_reshape; +}; +} // namespace arm_compute + +#endif // __ARM_COMPUTE_NE_FULLY_CONNECTED_RESHAPING_LAYER_H__ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h new file mode 100644 index 000000000..414b9f7d9 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2017-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCOREEX_H__ +#define __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCOREEX_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" +// #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h" +#include "arm_compute/runtime/Tensor.h" + +#include <memory> + +namespace arm_compute +{ +class ITensor; + +/** Basic function to execute GEMMLowpMatrixMultiplyCore on NEON. 
This function calls the following + * NEON kernels if the DOT product instruction is not available: + * + * -# @ref NEGEMMInterleave4x4Kernel + * -# @ref NEGEMMTranspose1xWKernel + * -# @ref NEGEMMLowpMatrixMultiplyKernel + * -# @ref NEGEMMLowpOffsetContributionKernel + * -# @ref NEActivationLayer + * + * otherwise if the DOT product instruction is available: + * + * -# @ref NEGEMMLowpOffsetContributionKernel + * +*/ +class NEGEMMLowpMatrixMultiplyCoreEx : public IFunction +{ +public: + /** Constructor */ + NEGEMMLowpMatrixMultiplyCoreEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGEMMLowpMatrixMultiplyCoreEx(const NEGEMMLowpMatrixMultiplyCoreEx &) = delete; + /** Default move constructor */ + NEGEMMLowpMatrixMultiplyCoreEx(NEGEMMLowpMatrixMultiplyCoreEx &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGEMMLowpMatrixMultiplyCoreEx &operator=(const NEGEMMLowpMatrixMultiplyCoreEx &) = delete; + /** Default move assignment operator */ + NEGEMMLowpMatrixMultiplyCoreEx &operator=(NEGEMMLowpMatrixMultiplyCoreEx &&) = default; + /** Initialise the kernel's inputs, output + * + * @note GEMM_LOWP: low precision GEMM kernel + * This kernel performs the following computations: + * + * -# Convert a values from QASYMM8 to int32 and add a_offset to each of them. + * -# Convert b values from QASYMM8 to int32 add b_offset to each of them. + * -# Compute the matrix product of the resulting a * b in int32. + * + * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is + * QASYMM8/QASYMM8_SIGNED otherwise + * + * @param[in] a First input tensor (Matrix A). Data type supported: + * QASYMM8/QASYMM8_SIGNED. + * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a + * @param[in] c Third input tensor (Matrix C). It can be a nullptr. 
Data type supported: + * S32 + * @param[out] output Output tensor. Data type supported: Data type supported: + * S32/QASYMM8/QASYMM8_SIGNED + * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped + * and + * if the reshape of matrix B should be executed only for the first run + */ + void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, + const GEMMInfo &gemm_info = GEMMInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEGEMMLowpMatrixMultiplyCoreEx + * + * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is + * QASYMM8/QASYMM8_SIGNED otherwise + * + * @param[in] a First input tensor info (Matrix A). Data type supported: + * QASYMM8/QASYMM8_SIGNED. + * @param[in] b Second input tensor info (Matrix B). Data type supported: same as @p a + * @param[in] c Third input tensor info (Matrix C). It can be a nullptr. Data type + * supported: S32 + * @param[in] output Output tensor info. 
Data type supported: Data type supported: + * S32/QASYMM8/QASYMM8_SIGNED + * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped + * and + * if the reshape of matrix B should be executed only for the first run + * + * @return a status + */ + static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, + const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo()); + + // Inherited methods overridden + void run() override; + void prepare() override; + +private: + MemoryGroup _memory_group; + NEGEMMAssemblyDispatch _asm_glue; + std::unique_ptr<INEKernel> _mm_kernel; + std::unique_ptr<INEKernel> _mtx_a_reshape_kernel; + std::unique_ptr<INEKernel> _mtx_b_reshape_kernel; + NEGEMMLowpMatrixAReductionKernel _mtx_a_reduction_kernel; + NEGEMMLowpMatrixBReductionKernel _mtx_b_reduction_kernel; + NEGEMMLowpOffsetContributionKernel _offset_contribution_kernel; + NEGEMMLowpOffsetContributionOutputStageKernel _offset_contribution_output_stage_kernel; + // NEActivationLayer _activation_func; + + Tensor _vector_sum_col; + Tensor _vector_sum_row; + Tensor _tmp_a; + Tensor _tmp_b; + Tensor _mm_result_s32; + Tensor _signed_a; + Tensor _signed_output; + const ITensor *_original_b; + int32_t _a_offset; + int32_t _b_offset; + + bool _run_vector_matrix_multiplication; + bool _assembly_path; + bool _fused_assembly_path; + bool _reshape_b_only_on_first_run; + bool _is_prepared; + bool _fuse_output_stage; + bool _run_activation; + bool _flip_signedness; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCOREEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h new file mode 100644 index 000000000..d95e6a81e --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. 
All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_NEGATHEREX_H__ +#define __ARM_COMPUTE_NEGATHEREX_H__ + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to run @ref NEGatherKernelEx */ +class NEGatherEx : public INESimpleFunctionNoBorder +{ +public: + /** Initialise the kernel's inputs and outputs + * + * @param[in] input Source tensor. Supported tensor rank: up to 4. Data type supported: + * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following type: U32/S32. Each value Must be in range [0, input.shape[@p axis]) + * @param[out] output Destination tensor. 
Data type supported: Same as @p input + * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0 + */ + void configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis = 0); + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEGatherKernelEx + * + * @param[in] input Source tensor info. Supported tensor rank: up to 4. Data type supported: + * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32. Each value Must be in range [0, input.shape[@p axis]) + * @param[in] output Destination tensor info. Data type supported: Same as @p input + * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0 + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis); +}; + +} // namespace arm_compute + +#endif /* __ARM_COMPUTE_NEGATHEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h new file mode 100644 index 000000000..69abf0192 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file NEHashtableLookup.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::NEHashtableLookup class + */ + +#ifndef __ARM_COMPUTE_NEHASHTABLELOOKUP_H__ +#define __ARM_COMPUTE_NEHASHTABLELOOKUP_H__ + +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" + +#include <vector> + +namespace arm_compute +{ +class ITensor; + +/** + * @brief Class to perform HashtableLookup operation + */ +class NEHashtableLookup : public INESimpleFunctionNoBorder +{ +public: + /** + * @brief Set the input and output tensors. + * @param[in] lookups Lookups 1D tensor that values are indices into the first dimension of + * input. Data types supported: S32 + * @param[in] keys Keys 1D tensor. keys and input pair represent a map. + * Data types supported: S32 + * @param[in] input Source tensor. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p + * input. + * @param[out] hits Hits 1D tensor. A boolean tensor that indicates whether the lookup hits + * (True) or not (False). Data types supported: U8/QASYMM8 + * @return N/A + */ + void configure(const ITensor *lookups, const ITensor *keys, const ITensor *input, ITensor *output, + ITensor *hits); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEHashtableLookup + * + * @param[in] lookups Lookups 1D tensor info. + * Data types supported: S32 + * @param[in] keys Keys 1D tensor info. keys and input pair represent a map. + * Data types supported: S32 + * @param[in] input Source tensor info. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p + * input. + * @param[in] hits Hits 1D tensor info. 
A boolean tensor that indicates whether the lookup + * hits (True) or not (False). Data types supported: U8/QASYMM8 + * + * @return a status + */ + static Status validate(const ITensorInfo *lookups, const ITensorInfo *keys, + const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *hits); +}; +} +#endif /*__ARM_COMPUTE_NEHASHTABLELOOKUP_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h new file mode 100644 index 000000000..521f50d2f --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYEREX_H__ +#define __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYEREX_H__ + +#include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEPermute.h" +#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" +#include "arm_compute/runtime/Tensor.h" + +#include <memory> + +namespace arm_compute +{ +class ITensor; + +/** Basic function to perform a Instance normalization. + * + * This function runs the following kernels: + * -# @ref NEInstanceNormalizationLayerKernelEx + */ +class NEInstanceNormalizationLayerEx : public IFunction +{ +public: + /** Constructor */ + NEInstanceNormalizationLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Set the input and output tensors. + * + * @param[in, out] input Source tensor. In case of @p output tensor = nullptr this tensor will + * store the result of the normalization. + * Data types supported: F16/F32. Data layout supported: NHWC, NCHW + * @param[out] output Destination tensor. Data types and data layouts supported: same as @p + * input. + * @param[in] gamma (Optional) The scale scalar value applied to the normalized tensor. + * Defaults to 1.0 + * @param[in] beta (Optional) The offset scalar value applied to the normalized tensor. + * Defaults to 0.0 + * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12 + */ + void configure(ITensor *input, ITensor *output, ITensor *gamma, ITensor *beta, + float epsilon = 1e-12f); + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEInstanceNormalizationLayer. + * + * @param[in] input Source tensor info. Data types supported: F16/F32. Data layout supported: + * NHWC, NCHW + * @param[in] output Destination tensor info. 
Data types and data layouts supported: same as @p + * input. + * @param[in] gamma (Optional) The scale scalar value applied to the normalized tensor. Defaults + * to 1.0 + * @param[in] beta (Optional) The offset scalar value applied to the normalized tensor. + * Defaults to 0.0 + * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12 + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma = nullptr, const ITensorInfo *beta = nullptr, + float epsilon = 1e-12f); + + // Inherited methods overridden: + void run() override; + +private: + MemoryGroup _memory_group; + NEInstanceNormalizationLayerKernelEx _normalization_kernel; + bool _is_nchw; + NEPermute _permute_input; + NEPermute _permute_output; + Tensor _permuted_input; + Tensor _permuted_output; +}; +} +#endif /* __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h new file mode 100644 index 000000000..5664c57cb --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEPRELU_H__ +#define __ARM_COMPUTE_NEPRELU_H__ + +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to run @ref NEPReLUKernel */ +class NEPReLU : public INESimpleFunctionNoBorder +{ +public: + /** Initialise the kernel's inputs and output + * + * @param[in] input Source tensor. Data types supported: QASYMM8/F32. + * @param[in] alpha Alpha tensor. Data types supported: Same as @p input. + * @param[out] output Output tensor. Data types supported: Same as @p input. + */ + void configure(const ITensor *input, const ITensor *alpha, ITensor *output); +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEPRELU_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h new file mode 100644 index 000000000..17c37d806 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_NERNNLAYER_EX_H__ +#define __ARM_COMPUTE_NERNNLAYER_EX_H__ + +#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h" +#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h" +#include "arm_compute/core/NEON/kernels/NECopyKernel.h" + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h" +#include "arm_compute/runtime/NEON/functions/NEGEMM.h" + +namespace arm_compute +{ +// Forward declarations +class ITensor; + +/** Basic function to run @ref NERNNLayerEx */ +class NERNNLayerEx : public IFunction +{ +public: + /** Default constructor */ + NERNNLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NERNNLayerEx(const NERNNLayerEx &) = delete; + /** Default move constructor */ + NERNNLayerEx(NERNNLayerEx &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NERNNLayerEx &operator=(const NERNNLayerEx &) = delete; + /** Default move assignment operator */ + NERNNLayerEx &operator=(NERNNLayerEx &&) = default; + /** Initialize the function + * + * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data + * types supported: F16/F32 + * @param[in] weights Weights tensor of shape [input_size, num_units] that + * multiplies the input. Data types supported: Same as @p input + * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies + * the current 'state'. Data types supported: Same as @p input + * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same + * as @p input + * @param[out] output Output tensor of shape [num_units, batch_size]. Data types + * supported: Same as @p input + * @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. 
Data types + * supported: Same as @p input + * @param[in] info Activation layer parameter. + */ + void configure(const ITensor *input, const ITensor *weights, const ITensor *recurrent_weights, + const ITensor *bias, ITensor *hidden_state, ITensor *output, + ActivationLayerInfo &info); + /** Initialize the function + * + * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data + * types supported: F16/F32 + * @param[in] weights Weights tensor of shape [input_size, num_units] that multiplies + * the input. Data types supported: Same as @p input + * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies the + * current 'state'. Data types supported: Same as @p input + * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same as @p + * input + * @param[in] output Output tensor of shape [num_units, batch_size]. Data types + * supported: Same as @p input + * @param[in] hidden_state Output tensor of shape [num_units, batch_size]. Data types + * supported: Same as @p input + * @param[in] info Activation layer parameter. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *recurrent_weights, const ITensorInfo *bias, + const ITensorInfo *hidden_state, const ITensorInfo *output, + const ActivationLayerInfo &info); + + // Inherited methods overridden: + void run() override; + void prepare() override; + +private: + MemoryGroup _memory_group; + NEGEMM _gemm_state_f; + NEArithmeticAdditionKernel _add_kernel; + NEActivationLayerKernel _activation_kernel; + NEFullyConnectedLayer _fully_connected_kernel; + NECopyKernel _copy_kernel; + Tensor _fully_connected_out; + Tensor _gemm_output; + Tensor _add_output; + bool _is_prepared; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NERNNLAYER_EX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h new file mode 100644 index 000000000..7209acf19 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__ +#define __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" +#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to perform reduce operation */ +class NEReduceMeanEx : public IFunction +{ +public: + /** Constructor */ + NEReduceMeanEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Configure kernel + * + * @note Supported tensor rank: up to 4 + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 + * @param[in] reduction_axis Reduction axis vector. + * @param[in] keep_dims If positive, retains reduced dimensions with length 1. + * @param[out] output Destination tensor. Data type supported: Same as @p input + */ + void configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, + ITensor *output); + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEReduceMeanEx + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 + * @param[in] reduction_axis Reduction axis vector. + * @param[in] keep_dims If positive, retains reduced dimensions with length 1. + * @param[in] output Destination tensor. 
Data type supported: Same as @p input + * + * @return A status + */ + static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis, + bool keep_dims, const ITensorInfo *output); + + // Inherited methods overridden: + void run() override; + +private: + MemoryGroup _memory_group; + std::unique_ptr<NEReductionOperation[]> _reduction_kernels{nullptr}; + std::unique_ptr<Tensor[]> _reduced_outs{nullptr}; + NEReshapeLayer _reshape; + unsigned int _reduction_ops; + bool _keep_dims; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h new file mode 100644 index 000000000..9c558e6a2 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEON_REDUCE_OPERATION_H__ +#define __ARM_COMPUTE_NEON_REDUCE_OPERATION_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/TypesEx.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEReductionOperationEx.h" +#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to perform reduce operation */ +class NEReduceOperation : public IFunction +{ +public: + /** Constructor */ + NEReduceOperation(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Configure kernel + * + * @note Supported tensor rank: up to 4 + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 + * @param[in] reduction_axis Reduction axis vector. + * @param[in] keep_dims If positive, retains reduced dimensions with length 1. + * @param[out] output Destination tensor. Data type supported: Same as @p input + * @param[in] op Reduce operation to perform. + */ + void configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, ITensor *output, + ReduceOperation op); + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEReduceOperation + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 + * @param[in] reduction_axis Reduction axis vector. + * @param[in] keep_dims If positive, retains reduced dimensions with length 1. + * @param[in] output Destination tensor. Data type supported: Same as @p input + * @param[in] op Reduce operation to perform. 
+ * + * @return A status + */ + static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis, + bool keep_dims, const ITensorInfo *output, ReduceOperation op); + + // Inherited methods overridden: + void run() override; + +private: + MemoryGroup _memory_group; + std::vector<NEReductionOperationEx> _reduction_kernels; + std::vector<Tensor> _reduced_outs; + NEReshapeLayer _reshape; + unsigned int _reduction_ops; + bool _keep_dims; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEON_REDUCE_OPERATION_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h new file mode 100644 index 000000000..c028ea658 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEON_REDUCE_SUM_H__ +#define __ARM_COMPUTE_NEON_REDUCE_SUM_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" +#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to perform reduce operation */ +class NEReduceSum : public IFunction +{ +public: + /** Constructor */ + NEReduceSum(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Configure kernel + * + * @note Supported tensor rank: up to 4 + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 + * @param[in] reduction_axis Reduction axis vector. + * @param[in] keep_dims If positive, retains reduced dimensions with length 1. + * @param[out] output Destination tensor. Data type supported: Same as @p input + */ + void configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, + ITensor *output); + + /** Static function to check if given info will lead to a valid configuration of @ref NEReduceSum + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 + * @param[in] reduction_axis Reduction axis vector. + * @param[in] keep_dims If positive, retains reduced dimensions with length 1. + * @param[in] output Destination tensor. 
Data type supported: Same as @p input + * + * @return A status + */ + static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis, + bool keep_dims, const ITensorInfo *output); + + // Inherited methods overridden: + void run() override; + +private: + MemoryGroup _memory_group; + std::vector<NEReductionOperation> _reduction_kernels; + std::vector<Tensor> _reduced_outs; + NEReshapeLayer _reshape; + unsigned int _reduction_ops; + bool _keep_dims; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEON_REDUCE_SUM_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReductionOperationEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReductionOperationEx.h new file mode 100644 index 000000000..7180742df --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReductionOperationEx.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEREDUCTIONOPERATIONEX_H__ +#define __ARM_COMPUTE_NEREDUCTIONOPERATIONEX_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to simulate a reduction operation. This function calls the following NEON + * kernels: + * + * -# @ref NEFillBorderKernel + * -# @ref NEReductionOperationKernelEx + * + */ +class NEReductionOperationEx : public IFunction +{ +public: + /** Default constructor */ + NEReductionOperationEx(); + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. + * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input. + * @param[in] axis Dimension along which to reduce. + * @param[in] op Reduction operation to perform. + */ + void configure(ITensor *input, ITensor *output, unsigned int axis, ReduceOperation op); + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEReductionOperationEx. + * + * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. + * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p + * input. + * @param[in] axis Dimension along which to reduce. + * @param[in] op Reduction operation to perform. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, + ReduceOperation op); + + // Inherited methods overridden: + void run() override; + +private: + NEReductionOperationKernelEx _reduction_kernel; + NEFillBorderKernel _fill_border_kernel; + size_t _window_split; + int _reduction_axis; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEREDUCTIONOPERATIONEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h new file mode 100644 index 000000000..302f9af2e --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NESPACETOBATCHLAYEREX_H__ +#define __ARM_COMPUTE_NESPACETOBATCHLAYEREX_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/NEON/kernels/NEMemsetKernel.h" +#include "arm_compute/core/NEON/kernels/NESpaceToBatchLayerKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to spatial divide a tensor. This function calls the following NEON + * kernels/functions: + * + * -# @ref NEMemsetKernel + * -# @ref NESpaceToBatchLayerKernel + */ +class NESpaceToBatchLayerEx : public IFunction +{ +public: + /** Default constructor */ + NESpaceToBatchLayerEx(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NESpaceToBatchLayerEx(const NESpaceToBatchLayerEx &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NESpaceToBatchLayerEx &operator=(const NESpaceToBatchLayerEx &) = delete; + /** Allow instances of this class to be moved */ + NESpaceToBatchLayerEx(NESpaceToBatchLayerEx &&) = default; + /** Allow instances of this class to be moved */ + NESpaceToBatchLayerEx &operator=(NESpaceToBatchLayerEx &&) = default; + /** Default destructor */ + virtual ~NESpaceToBatchLayerEx() = default; + /** Set the input and output tensors. + * + * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32 + * @param[in] paddings 2-D tensor with shape [2, M]. Data types supported: S32 + * @param[out] output Tensor output. 
Data types supported: same as @p input + */ + void configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, + ITensor *output); + /** Set the input and output tensors. (Static block shape and paddings) + * + * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] block_shape_x Block shape x value. + * @param[in] block_shape_y Block shape y value. + * @param[in] padding_left The left padding of the output tensor. + * @param[in] padding_right The right padding of the output tensor. + * @param[out] output Tensor output. Data types supported: same as @p input + */ + void configure(const ITensor *input, const int block_shape_x, const int block_shape_y, + const Size2D &padding_left, const Size2D &padding_right, ITensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref + * NESpaceToBatchLayerEx + * + * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] block_shape block shape tensor info with shape [M]. Data types supported: S32 + * @param[in] paddings paddings tensor info with shape [2, M]. Data types supported: S32 + * @param[in] output Tensor output info. Data types supported: same as @p input + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, + const ITensorInfo *paddings, const ITensorInfo *output); + /** Static function to check if given info will lead to a valid configuration of @ref + * NESpaceToBatchLayerEx (Static block shape and paddings) + * + * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] block_shape_x Block shape x value. + * @param[in] block_shape_y Block shape y value. + * @param[in] padding_left The left padding of the output tensor. 
+ * @param[in] padding_right The right padding of the output tensor. + * @param[in] output Tensor output info. Data types supported: same as @p input + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, + const Size2D &padding_left, const Size2D &padding_right, + const ITensorInfo *output); + + // Inherited methods overridden: + void run() override; + +private: + NESpaceToBatchLayerKernel _space_to_batch_kernel; /**< SpaceToBatch kernel to run */ + NEMemsetKernel _memset_kernel; /**< Memset kernel to run */ + bool _has_padding; /**< Flag to check if the output has padding */ +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NESPACETOBATCHLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h new file mode 100644 index 000000000..117717b55 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NESPACETODEPTHLAYEREX_H__ +#define __ARM_COMPUTE_NESPACETODEPTHLAYEREX_H__ + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" + +namespace arm_compute +{ +class ITensor; + +/** This function calls the following NEON kernels/functions: + * + * -# @ref NESpaceToDepthLayerKernelEx + */ +class NESpaceToDepthLayerEx : public INESimpleFunctionNoBorder +{ +public: + /** Set the input and output tensors. + * + * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[out] output Tensor output. Data types supported: same as @p input + * @param[in] block_shape Block shape value + */ + void configure(const ITensor *input, ITensor *output, int32_t block_shape); + /** Static function to check if given info will lead to a valid configuration of @ref + * NESpaceToDepthLayerEx (Static block shape and paddings) + * + * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] output Tensor output info. 
Data types supported: same as @p input + * @param[in] block_shape Block shape value + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NESPACETODEPTHLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h new file mode 100644 index 000000000..a50b9ea60 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifndef __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__
+#define __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__
+
+#include "arm_compute/runtime/CPP/functions/CPPUpsampleEx.h"
+#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEPermute.h"
+
+#include "arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** Function to run the deconvolution layer.
+ *
+ * Transpose convolution Layer is the backward pass of Convolution Layer. First we transform the
+ * input depending on the stride and pad info and then perform a 1x1
+ * convolution pass. Input stride defines how many zeroes we should put between each element of the
+ * input, pad is the amount of padding and finally a is a user
+ * specified value where a < stride - 1 that increases the padding top and right of the input image.
+ *
+ * The relation between input to output is as follows:
+ * \f[
+ * width\_output = (width\_input - 1) \cdot stride\_x - 2 \cdot padding\_x + kernel\_x
+ * \f]
+ * \f[
+ * height\_output = (height\_input - 1) \cdot stride\_y - 2 \cdot padding\_y + kernel\_y
+ * \f]
+ *
+ * where
+ * width is the size of the first input dimension.
+ * height is the size of the second input dimension.
+ * width_output is the size of the first output dimension.
+ * height_output is the size of the second output dimension.
+ * kernel_x and kernel_y are the convolution sizes in x and y.
+ * stride_x and stride_y is the input stride of the first and second dimension.
+ *
+ * The weights used by Transpose convolution are supposed to be the same as the ones used for
+ * Convolution.
 Therefore, it will be necessary to use the weights in the
+ * reverse order to perform an actual convolution. This is achieved by using the @ref
+ * CPPFlipWeightsKernel.
+ *
+ * This function calls the following NEON kernels/functions:
+ *
+ * -# @ref CPPUpsample
+ * -# @ref NEConvolutionLayer
+ *
+ */
+class NETransposeConvLayer : public IFunction
+{
+public:
+ /** Default constructor */
+ NETransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NETransposeConvLayer(const NETransposeConvLayer &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NETransposeConvLayer &operator=(const NETransposeConvLayer &) = delete;
+ /** Allow instances of this class to be moved */
+ NETransposeConvLayer(NETransposeConvLayer &&) = default;
+ /** Allow instances of this class to be moved */
+ NETransposeConvLayer &operator=(NETransposeConvLayer &&) = default;
+ /** Default destructor */
+ virtual ~NETransposeConvLayer() = default;
+
+ /** Set the input, weights, biases and output tensors.
+ *
+ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an
+ * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8.
+ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type
+ * supported: Same as @p input.
+ * @param[in] bias Optional, ignored if NULL. The biases have one dimension. Data type
+ * supported: Data types supported: S32 for QASYMM8 input, F32 for F32 input, F16 for F16 input.
+ * @param[out] output Output tensor. The output has the same number of dimensions as the @p
+ * input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution, this is
+ * described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+ *
+ */
+ void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output,
+ const PadStrideInfo &info, unsigned int invalid_right,
+ unsigned int invalid_bottom);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NETransposeConvLayer
+ *
+ * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an
+ * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8.
+ * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data type
+ * supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension. Data type supported: Data types
+ * supported: S32 for QASYMM8 input, F32 for F32 input, F16 for F16 input.
+ * @param[in] output Output tensor info. The output has the same number of dimensions as the @p
+ * input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution, this is
+ * described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *bias, const ITensorInfo *output, + const PadStrideInfo &info, unsigned int invalid_right, + unsigned int invalid_bottom); + + // Inherited methods overridden: + void run() override; + void prepare() override; + +private: + MemoryGroup _memory_group; + NEConvolutionLayer _conv_f; + CPPUpsampleEx _upsample_f; + CPPFlipWeightsKernel _flip_weights; + NEPermute _permute_input; + NEPermute _permute_weights; + NEPermute _permute_output; + Tensor _scaled_output; + Tensor _weights_flipped; + Tensor _permuted_input; + Tensor _permuted_weights; + Tensor _permuted_output; + bool _is_nchw; + const ITensor *_original_weights; + ITensor *_input; + PadStrideInfo _info; + bool _is_prepared; +}; +} // arm_compute +#endif /* __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/misc/functions/GenericGather.h b/compute/ARMComputeEx/arm_compute/runtime/misc/functions/GenericGather.h new file mode 100644 index 000000000..3db0c7e5e --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/misc/functions/GenericGather.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file       GenericGather.h + * @brief      This file contains GenericGather class + * @ingroup    COM_AI_RUNTIME + */ + +#ifndef __ARM_COMPUTE_MISC_GENERIC_GATHER_H__ +#define __ARM_COMPUTE_MISC_GENERIC_GATHER_H__ + +#include <arm_compute/runtime/Tensor.h> +#include <arm_compute/runtime/CL/CLTensor.h> + +#include <arm_compute/runtime/CL/functions/CLPermute.h> +#include <arm_compute/runtime/CL/functions/CLGatherEx.h> + +#include "Utils.h" + +namespace arm_compute +{ +namespace misc +{ + +/** + * @brief Class to run Gather with both CPU and GPU + */ +class GenericGather : public arm_compute::IFunction +{ +public: + GenericGather(void) + { + // DO NOTHING + } + +public: + /** + * @brief Configure the layer + * @param[in] input The source tensor + * @param[in] indices The indices tensor + * @param[in] output The destination tensor + * @param[in] axis (Optional) The axis in input to gather indices from + * @return N/A + */ + void configure(arm_compute::ITensor *input, arm_compute::ITensor *indices, + arm_compute::ITensor *output, int axis = 0); + +public: + /** + * @brief Run the operation. Must be called after configure(). + * @return N/A + */ + void run(void) override; + +private: + arm_compute::ITensor *_input{nullptr}; + arm_compute::ITensor *_indices{nullptr}; + arm_compute::ITensor *_output{nullptr}; + int _axis{0}; + arm_compute::CLTensor _cl_permuted; + +private: + arm_compute::CLPermute _cl_permute; + arm_compute::CLGatherEx _cl_gather; +}; + +} // namespace misc +} // namespace arm_compute + +#endif // __ARM_COMPUTE_MISC_GENERIC_GATHER_H__ diff --git a/compute/ARMComputeEx/arm_compute/runtime/misc/functions/GenericReshapeLayer.h b/compute/ARMComputeEx/arm_compute/runtime/misc/functions/GenericReshapeLayer.h new file mode 100644 index 000000000..ab2fdc71d --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/misc/functions/GenericReshapeLayer.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file       GenericReshapeLayer.h + * @brief      This file contains GenericReshapeLayer class + * @ingroup    COM_AI_RUNTIME + */ + +#ifndef __ARM_COMPUTE_MISC_GENERIC_RESHAPE_LAYER_H__ +#define __ARM_COMPUTE_MISC_GENERIC_RESHAPE_LAYER_H__ + +#include <arm_compute/runtime/Tensor.h> +#include <arm_compute/runtime/CL/CLTensor.h> + +#include <arm_compute/runtime/CL/functions/CLPermute.h> +#include <arm_compute/runtime/CL/functions/CLReshapeLayer.h> +#include <arm_compute/runtime/NEON/functions/NEPermute.h> +#include <arm_compute/runtime/NEON/functions/NEReshapeLayer.h> + +#include "Utils.h" + +namespace arm_compute +{ +namespace misc +{ + +/** + * @brief Class to run Reshape Layer with both CPU and GPU + */ +class GenericReshapeLayer : public arm_compute::IFunction +{ +public: + GenericReshapeLayer(void) + : _input(nullptr), _output(nullptr), _cl_permuted{}, _neon_permuted{}, _cl_permute{}, + _cl_reshape{}, _neon_permute{}, _neon_reshape{} + { + // DO NOTHING + } + +public: + /** + * @brief Configure the layer + * @param[in] input The source tensor + * @param[in] output The destination tensor + * @return N/A + */ + void configure(const arm_compute::ITensor *input, arm_compute::ITensor *output); + +public: + /** + * @brief Run the operation. Must be called after configure(). 
+ * @return N/A + */ + void run(void) override; + +private: + const arm_compute::ITensor *_input; + arm_compute::ITensor *_output; + arm_compute::CLTensor _cl_permuted; + arm_compute::Tensor _neon_permuted; + +private: + arm_compute::CLPermute _cl_permute; + arm_compute::CLReshapeLayer _cl_reshape; + + arm_compute::NEPermute _neon_permute; + arm_compute::NEReshapeLayer _neon_reshape; +}; + +} // namespace misc +} // namespace arm_compute + +#endif // __ARM_COMPUTE_MISC_GENERIC_RESHAPE_LAYER_H__ diff --git a/compute/ARMComputeEx/arm_compute/runtime/misc/functions/Utils.h b/compute/ARMComputeEx/arm_compute/runtime/misc/functions/Utils.h new file mode 100644 index 000000000..53736f55f --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/misc/functions/Utils.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file utils.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains utils for arm compute library + */ +#ifndef __ARM_COMPUTE_MISC_UTILS_H__ +#define __ARM_COMPUTE_MISC_UTILS_H__ + +#include <string> +#include <cassert> +#include <arm_compute/runtime/CL/CLTensor.h> + +#include <arm_compute/core/Coordinates.h> +#include <arm_compute/core/TensorInfo.h> +#include <arm_compute/core/TensorShape.h> +#include <arm_compute/core/Types.h> + +// TODO : It should be extracted to independent module. 
+ +namespace arm_compute +{ +namespace misc +{ +namespace utils +{ + +/** + * @brief Check if this runtime runs on GPU or NEON + * @return @c true if GPU mode, otherwise @c false + */ +bool isGpuMode(); + +#ifndef CAST_CL +#define CAST_CL(tensor) static_cast<::arm_compute::CLTensor *>(tensor) +#endif + +#ifndef CAST_NE +#define CAST_NE(tensor) static_cast<::arm_compute::Tensor *>(tensor) +#endif + +/** +* @brief Generate arm compute permutation vector from runtime permutation vector +* @param[in] rank Rank number supported upto 4 +* @param[in] runtime_pv Integer array for runtime permutation vector +* @return Permutation vector of arm compute +*/ +arm_compute::PermutationVector getARMComputePermutationVector(uint32_t rank, + const int32_t *runtime_pv); + +/** + * @brief Set value to arm compute tensor with casting + * @param[in] value Value to set + * @param[out] to Target tensor of arm compute + * @param[in] id Position of element + * @return N/A + */ +template <typename FromT> +void copyCast(const FromT value, arm_compute::ITensor *to, const arm_compute::Coordinates &id) +{ + switch (to->info()->data_type()) + { + case arm_compute::DataType::F32: + { + *reinterpret_cast<float *>(to->ptr_to_element(id)) = static_cast<float>(value); + break; + } + case arm_compute::DataType::S32: + { + *reinterpret_cast<int32_t *>(to->ptr_to_element(id)) = static_cast<int32_t>(value); + break; + } + case arm_compute::DataType::U32: + { + *reinterpret_cast<uint32_t *>(to->ptr_to_element(id)) = static_cast<uint32_t>(value); + break; + } + case arm_compute::DataType::QASYMM8: + { + float realValue = static_cast<float>(value); + // NOTE We haven't known the policy of rounding for quantization. + // So this is set to a temporary value. 
+ *(to->ptr_to_element(id)) = + to->info()->quantization_info().quantize(realValue, arm_compute::RoundingPolicy::TO_ZERO); + break; + } + default: + throw std::runtime_error("Not supported, yet"); + break; + } +} + +} // namespace utils +} // namespace misc +} // namespace arm_compute + +#endif // __ARM_COMPUTE_MISC_UTILS_H__ diff --git a/compute/ARMComputeEx/resolve_includes.py b/compute/ARMComputeEx/resolve_includes.py new file mode 100644 index 000000000..b3e252892 --- /dev/null +++ b/compute/ARMComputeEx/resolve_includes.py @@ -0,0 +1,102 @@ +# Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +# Copyright (c) 2016, 2017 ARM Limited. +# +# SPDX-License-Identifier: MIT +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+import collections +import os.path +import re +import subprocess +import glob + + +def resolve_includes(target, source): + # File collection + FileEntry = collections.namedtuple('FileEntry', 'target_name file_contents') + + # Include pattern + pattern = re.compile("#include \"(.*)\"") + + # Get file contents + files = [] + for i in range(len(source)): + src = source[i] + dst = target[i] + f = open(src) + cts = f.read() + f.close() + contents = cts.splitlines() + entry = FileEntry(target_name=dst, file_contents=contents) + files.append((os.path.basename(src), entry)) + + # Create dictionary of tupled list + files_dict = dict(files) + + # Check for includes (can only be files in the same folder) + final_files = [] + for file in files: + done = False + tmp_file = file[1].file_contents + print(file[1].target_name) + while not done: + file_count = 0 + updated_file = [] + for line in tmp_file: + found = pattern.search(line) + if found: + include_file = found.group(1) + data = files_dict[include_file].file_contents + updated_file.extend(data) + else: + updated_file.append(line) + file_count += 1 + + # Check if all include are replaced. 
+ if file_count == len(tmp_file): + done = True + + # Update temp file + tmp_file = updated_file + + # Append and prepend string literal identifiers and add expanded file to final list + tmp_file.insert(0, "R\"(\n") + tmp_file.append("\n)\"") + entry = FileEntry(target_name=file[1].target_name, file_contents=tmp_file) + final_files.append((file[0], entry)) + + # Write output files + for file in final_files: + with open(file[1].target_name, 'w+') as out_file: + out_file.write("\n".join(file[1].file_contents)) + + +# Generate embed files +cl_files = glob.glob('src/core/CL/cl_kernels/*.cl') +cl_files += glob.glob('src/core/CL/cl_kernels/*.h') + +# DEBUG: print cl files +print("cl_files:") +print(cl_files) + +embed_files = [f + "embed" for f in cl_files] +print("embed_files:") +print(embed_files) + +resolve_includes(embed_files, cl_files) diff --git a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp new file mode 100644 index 000000000..7d4760600 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp @@ -0,0 +1,359 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Utils.h" + +#include <algorithm> +#include <fstream> +#include <iostream> +#include <utility> +#include <vector> + +using namespace arm_compute; + +const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map = { + // ARMComputeEx kernels + {"arg_op", "arg_operation.cl"}, + {"arithmetic_add_qasymm8", "arithmetic_op_quantized.cl"}, + {"binary_logical_op", "binary_logical_op.cl"}, + {"cast", "cast.cl"}, + {"cast_qasymm_in", "cast.cl"}, + {"cast_qasymm_out", "cast.cl"}, + {"comparison_op", "comparison_op.cl"}, + {"comparison_op_qasymm8", "comparison_op_quantized.cl"}, + {"depth_to_space_nchw", "depth_to_space.cl"}, + {"depth_to_space_nhwc", "depth_to_space.cl"}, + {"embedding_lookup", "embedding_lookup.cl"}, + {"gather_ex", "gather_ex.cl"}, + {"gather_ex_1d", "gather_ex.cl"}, + {"gather_ex_1d_out", "gather_ex.cl"}, + {"hashtable_lookup", "hashtable_lookup.cl"}, + {"instance_normalization_ex", "instance_normalization_ex.cl"}, + {"neg_tensor", "neg_tensor.cl"}, + {"permute_generic", "permute_ex.cl"}, + {"pixelwise_mul_qasymm8", "pixelwise_mul_quantized.cl"}, + {"prelu", "prelu.cl"}, + {"prelu_qasymm8", "prelu_quantized.cl"}, + {"reduce_min_max", "reduce_operation.cl"}, + {"reduce_sum_mean", "reduce_operation.cl"}, + {"topkv2_init", "topkv2.cl"}, + 
{"topkv2_find_first_negative", "topkv2.cl"}, + {"topkv2_reorder_negatives", "topkv2.cl"}, + {"topkv2_store", "topkv2.cl"}, + {"radixsort_histogram", "topkv2_radixsort.cl"}, + {"radixsort_scanhistograms", "topkv2_radixsort.cl"}, + {"radixsort_pastehistograms", "topkv2_radixsort.cl"}, + {"radixsort_reorder", "topkv2_radixsort.cl"}, + {"topkv2_quicksort", "topkv2_quicksort.cl"}, + {"space_to_batch_4d_nchw", "space_to_batch.cl"}, + {"space_to_batch_4d_nhwc", "space_to_batch.cl"}, + {"space_to_depth_nchw", "space_to_depth.cl"}, + {"space_to_depth_nhwc", "space_to_depth.cl"}, +}; + +const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map = { +#ifdef EMBEDDED_KERNELS + { + "arg_operation.cl", +#include "./cl_kernels/arg_operation.clembed" + }, + { + "cast.cl", +#include "./cl_kernels/cast.clembed" + }, + { + "embedding_lookup.cl", +#include "./cl_kernels/embedding_lookup.clembed" + }, + { + "depth_to_space.cl", +#include "./cl_kernels/depth_to_space.clembed" + }, + { + "gather_ex.cl", +#include "./cl_kernels/gather_ex.clembed" + }, + { + "hashtable_lookup.cl", +#include "./cl_kernels/hashtable_lookup.clembed" + }, + { + "helpers.h", +#include "./cl_kernels/helpers.hembed" + }, + { + "helpers_asymm.h", +#include "./cl_kernels/helpers_asymm.hembed" + }, + { + "instance_normalization_ex.cl", +#include "./cl_kernels/instance_normalization_ex.clembed" + }, + { + "binary_logical_op.cl", +#include "./cl_kernels/binary_logical_op.clembed" + }, + { + "neg_tensor.cl", +#include "./cl_kernels/neg_tensor.clembed" + }, + { + "prelu.cl", +#include "./cl_kernels/prelu.clembed" + }, + { + "prelu_quantized.cl", +#include "./cl_kernels/prelu_quantized.clembed" + }, + { + "reduce_operation.cl", +#include "./cl_kernels/reduce_operation.clembed" + }, + { + "space_to_batch.cl", +#include "./cl_kernels/space_to_batch.clembed" + }, + { + "space_to_depth.cl", +#include "./cl_kernels/space_to_depth.clembed" + }, + { + "topkv2.cl", +#include "./cl_kernels/topkv2.clembed" + 
}, + { + "topkv2_radixsort.cl", +#include "./cl_kernels/topkv2_radixsort.clembed" + }, + { + "topkv2_quicksort.cl", +#include "./cl_kernels/topkv2_quicksort.clembed" + }, + +#endif /* EMBEDDED_KERNELS */ +}; + +CLKernelLibraryEx::CLKernelLibraryEx() + : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map() +{ + opencl_is_available(); // Make sure the OpenCL symbols are initialised *before* the + // CLKernelLibraryEx is built +} + +CLKernelLibraryEx &CLKernelLibraryEx::get() +{ + static CLKernelLibraryEx _kernel_library; + return _kernel_library; +} + +Kernel CLKernelLibraryEx::create_kernel(const std::string &kernel_name, + const StringSet &build_options_set) const +{ + // Find which program contains the kernel + auto kernel_program_it = _kernel_program_map.find(kernel_name); + + if (_kernel_program_map.end() == kernel_program_it) + { + ARM_COMPUTE_ERROR("Kernel %s not found in the CLKernelLibrary", kernel_name.c_str()); + } + std::string concat_str; + + if (fp16_supported()) + { + concat_str += " -DARM_COMPUTE_OPENCL_FP16_ENABLED=1 "; + } + + if (get_cl_version(_device) == CLVersion::CL20) + { + concat_str += " -cl-std=CL2.0 "; + } + else if (arm_non_uniform_workgroup_supported(_device)) + { + concat_str += " -cl-arm-non-uniform-work-group-size "; + } + else + { + ARM_COMPUTE_ERROR("Non uniform workgroup size is not supported!!"); + } + + // Check if the program has been built before with same build options. 
+ const std::string program_name = kernel_program_it->second; + const std::string build_options = stringify_set(build_options_set) + concat_str; + + const std::string built_program_name = program_name + "_" + build_options; + auto built_program_it = _built_programs_map.find(built_program_name); + + cl::Program cl_program; + + if (_built_programs_map.end() != built_program_it) + { + // If program has been built, retrieve to create kernel from it + cl_program = built_program_it->second; + } + else + { + // Get program + Program program = load_program(program_name); + + // Build program + cl_program = program.build(build_options); + + // Add built program to internal map + _built_programs_map.emplace(built_program_name, cl_program); + } + + // Create and return kernel + return Kernel(kernel_name, cl_program); +} + +void CLKernelLibraryEx::add_built_program(const std::string &built_program_name, + cl::Program program) +{ + _built_programs_map.emplace(built_program_name, program); +} + +bool CLKernelLibraryEx::fp16_supported() const { return ::fp16_supported(_device); } + +bool CLKernelLibraryEx::int64_base_atomics_supported() const +{ + return device_supports_extension(_device, "cl_khr_int64_base_atomics"); +} + +const Program &CLKernelLibraryEx::load_program(const std::string &program_name) const +{ + const auto program_it = _programs_map.find(program_name); + + if (program_it != _programs_map.end()) + { + return program_it->second; + } + + Program program; + +#ifdef EMBEDDED_KERNELS + const auto program_source_it = _program_source_map.find(program_name); + + if (_program_source_map.end() == program_source_it) + { + ARM_COMPUTE_ERROR("Embedded program for %s does not exist.", program_name.c_str()); + } + + program = Program(_context, program_name, program_source_it->second); +#else /* EMBEDDED_KERNELS */ + // Check for binary + std::string source_name = _kernel_path + program_name; + std::string binary_name = source_name + "bin"; + + if 
(std::ifstream(binary_name).is_open()) + { + const std::string program_binary = read_file(binary_name, true); + program = Program(_context, _device, program_name, + std::vector<unsigned char>(program_binary.begin(), program_binary.end())); + } + else if (std::ifstream(source_name).is_open()) + { + program = Program(_context, program_name, read_file(source_name, false)); + } + else + { + ARM_COMPUTE_ERROR("Kernel file %s does not exist.", source_name.c_str()); + } +#endif /* EMBEDDED_KERNELS */ + + // Insert program to program map + const auto new_program = _programs_map.emplace(program_name, std::move(program)); + + return new_program.first->second; +} + +std::string CLKernelLibraryEx::stringify_set(const StringSet &s) const +{ + std::string concat_set; + +#ifndef EMBEDDED_KERNELS + concat_set += "-I" + _kernel_path + " "; +#endif /* EMBEDDED_KERNELS */ + + // Concatenate set + for (const auto &el : s) + { + concat_set += " " + el; + } + + return concat_set; +} + +std::string CLKernelLibraryEx::get_program_source(const std::string &program_name) +{ + const auto program_source_it = _program_source_map.find(program_name); + + if (program_source_it == _program_source_map.end()) + { + ARM_COMPUTE_ERROR("Embedded program for %s does not exist.", program_name.c_str()); + } + + return program_source_it->second; +} + +size_t CLKernelLibraryEx::max_local_workgroup_size(const cl::Kernel &kernel) const +{ + size_t result; + + size_t err = kernel.getWorkGroupInfo(_device, CL_KERNEL_WORK_GROUP_SIZE, &result); + ARM_COMPUTE_ERROR_ON_MSG( + err != 0, + "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel"); + ARM_COMPUTE_UNUSED(err); + + return result; +} + +cl::NDRange CLKernelLibraryEx::default_ndrange() const +{ + // GPUTarget _target = get_target_from_device(_device); + cl::Device device = cl::Device::getDefault(); + GPUTarget _target = get_target_from_device(device); + cl::NDRange default_range; + + switch (_target) + { + case 
GPUTarget::MIDGARD: + case GPUTarget::T600: + case GPUTarget::T700: + case GPUTarget::T800: + default_range = cl::NDRange(128u, 1); + break; + default: + default_range = cl::NullRange; + } + + return default_range; +} + +std::string CLKernelLibraryEx::get_device_version() { return _device.getInfo<CL_DEVICE_VERSION>(); } diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl new file mode 100644 index 000000000..2a6dfc91f --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) +/** Perform arg_max/arg_min + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. + * e.g. -DDATA_TYPE=short + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. + * e.g. -DDEPTH_OUT=16 + * @attention Operation type(code) specifying which operation to perform should be passed as + * preprocessor argument using -DOP_CODE = number. e.g. -DOP_CODE=1 + * + * @param[in] input_ptr Pointer to the source image. 
Supported data + * types: + * U8/QASYMM8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension + * (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension + * (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension + * (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element + * in the source image + * @param[in] input_stride_w Stride of the source tensor in W dimension + * (in bytes) + * @param[in] input_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[out] output_ptr Pointer to the destination image. 
+ * Supported data types: U32 + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension + * (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension + * (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image + * @param[in] axis Axis through which reduction occurs + * @param[in] dim Dimension across the axis to be reduced. 
+ */ + +__kernel void arg_op(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), const int axis, + const int dim) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + + int indices[4] = { + get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT, + get_global_id(2) / DEPTH_OUT, + }; + + DATA_TYPE value = + *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); + DATA_TYPE tval = value; + int idx = 0; + for (int i = 1; i < dim; ++i) + { + indices[axis] = i; + +#if OP_CODE == 1 // ArgMax + value = max(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], + indices[2], indices[3]))); +#elif OP_CODE == 2 // ArgMin + value = min(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], + indices[2], indices[3]))); +#else + return; + +#endif + + if (tval != value) + { + idx = indices[axis]; + tval = value; + } + } + + *((__global uint *)out.ptr) = idx; +} +#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl new file mode 100644 index 000000000..77e239f55 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016, 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers_asymm.h" + +#ifdef SATURATE +#define ADD(x, y) add_sat((x), (y)) +#define SUB(x, y) sub_sat((x), (y)) +#else /* SATURATE */ +#define ADD(x, y) (x) + (y) +#define SUB(x, y) (x) - (y) +#endif /* SATURATE */ + +/** Performs a pixelwise addition used to quantize down the int32 accumulator values of GEMMLowp to + * QASYMM8 + * + * The following computations will be performed: + * + * -# Add offset terms to inputs + -# Get scaled value of two inputs + * -# Add inputs + * -# Add offset terms to final result + * -# Multiply each entry of result by result_mult_int + * -# Shift the int32 accumulator by result_shift + * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. + * + * @attention The inputs and output data types need to be passed at compile time using + * -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: + * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar + * @attention The number of bits to shift left of input tensors must be passed at compile time using + * -DLEFT_SHIFT + * @attention The offset, scalar scale factor and number of bits to shift right of input tensors + * must be passed at compile time using -DIN1_OFFSET, -RIN1_MULT_INT, -DIN1_SHIFT, + -DIN2_OFFSET, + * -RIN2_MULT_INT and -DIN2_SHIFT + * @attention The offset, scalar scale factor and number of bits to shift right of output tensor + * must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and + -DRESULT_SHIFT + * + * @attention The input and output data_types need to be passed at compile time using + * -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: + * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar + * @attention The inputs and output scale information of qasymm8 need to be passed at compile time + * using -DSCALE_IN1, -DSCALE_IN2 and -DSCALE_OUT: + * e.g. 
-DSCALE_IN1=1.f -DSCALE_IN2=1.f -DSCALE_OUT=2.f + * @attention The inputs and output scale offset need to be passed at compile time using + * -DOFFSET_IN1, -DOFFSET_IN2 and -DOFFSET_OUT: + * e.g. -DOFFSET_IN1=0 -DOFFSET_IN2=0 -DOFFSET_OUT=0 + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 + * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise + * wrapping policy will be used. + * + * @param[in] in1_ptr Pointer to the source tensor. + * Supported data types: QASYMM8 + * @param[in] in1_stride_x Stride of the source tensor in X dimension + * (in bytes) + * @param[in] in1_step_x in1_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] in1_stride_y Stride of the source tensor in Y dimension + * (in bytes) + * @param[in] in1_step_y in1_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] in1_stride_z Stride of the source tensor in Z dimension + * (in bytes) + * @param[in] in1_step_z in1_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] in2_ptr Pointer to the source tensor. 
Supported data types: + * QASYMM8 + * @param[in] in2_stride_x Stride of the source tensor in X dimension + * (in bytes) + * @param[in] in2_step_x in2_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] in2_stride_y Stride of the source tensor in Y dimension + * (in bytes) + * @param[in] in2_step_y in2_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] in2_stride_z Stride of the source tensor in Z dimension + * (in bytes) + * @param[in] in2_step_z in2_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[out] out_ptr Pointer to the destination tensor. + * Supported data types: QASYMM8 + * @param[in] out_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination tensor in Y dimension + * (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] out_stride_z Stride of the source tensor in Z dimension + * (in bytes) + * @param[in] out_step_z out_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination + * tensor + */ +__kernel void arithmetic_add_qasymm8(TENSOR3D_DECLARATION(in1), TENSOR3D_DECLARATION(in2), + TENSOR3D_DECLARATION(out)) +{ + // Get pixels pointer + Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); + Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); + + // Load data + VEC_DATA_TYPE(int, 16) + in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16)); + VEC_DATA_TYPE(int, 16) + in2_data = CONVERT(vload16(0, 
(__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16)); + + // Get scaled value of two inputs + VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET); + VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET); + + VEC_DATA_TYPE(int, 16) + left_shift = (VEC_DATA_TYPE(int, 16))1 << (VEC_DATA_TYPE(int, 16))(LEFT_SHIFT); + VEC_DATA_TYPE(int, 16) shifted_in1_val = in1_val * left_shift; + VEC_DATA_TYPE(int, 16) shifted_in2_val = in2_val * left_shift; + + VEC_DATA_TYPE(int, 16) + scaled_in1_val = + ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in1_val, IN1_MULT_INT, IN1_SHIFT, 16); + VEC_DATA_TYPE(int, 16) + scaled_in2_val = + ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in2_val, IN2_MULT_INT, IN2_SHIFT, 16); + + // Add inputs and multiply with a multiplier smaller than 1 + VEC_DATA_TYPE(int, 16) sum_val = scaled_in1_val + scaled_in2_val; + VEC_DATA_TYPE(int, 16) + out_val = + ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(sum_val, RESULT_MULT_INT, RESULT_SHIFT, 16); + out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET); + + VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16)); + + // TODO: Apply min-max BOUND to support fuse with relu. + /* + #if defined(MIN_BOUND) + res = max(res, (uchar16)MIN_BOUND); + #endif // defined(MIN_BOUND) + #if defined(MAX_BOUND) + res = min(res, (uchar16)MAX_BOUND); + #endif // defined(MAX_BOUND) + */ + + // Store result + VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr); +} diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl new file mode 100644 index 000000000..8c875516d --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(OP_CODE) && defined(DATA_TYPE) +/** returns truth value of the two input tensors for BINARY LOGICAL OP. + * where BINARY LOGICAL OP can be AND, OR. + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=uchar + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. + * e.g. -DVEC_SIZE=16 + * @attention Operation type(code) specifying which operation to perform should be passed as + * preprocessor argument using -DOP_CODE = number. e.g. -DOP_CODE=1 + * + * @param[in] input1_ptr Pointer to the source tensor. 
+ * Supported data types: QASYMM8 + * @param[in] input1_stride_x Stride of the source tensor in X dimension + * (in bytes) + * @param[in] input1_step_x input1_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input1_stride_y Stride of the source tensor in Y dimension + * (in bytes) + * @param[in] input1_step_y input1_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input1_stride_z Stride of the source tensor in Z dimension + * (in bytes) + * @param[in] input1_step_z input1_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] input2_ptr Pointer to the source tensor. + * Supported data types: QASYMM8 + * @param[in] input2_stride_x Stride of the source tensor in X dimension + * (in bytes) + * @param[in] input2_step_x input2_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input2_stride_y Stride of the source tensor in Y dimension + * (in bytes) + * @param[in] input2_step_y input2_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input2_stride_z Stride of the source tensor in Z dimension + * (in bytes) + * @param[in] input2_step_z input2_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[out] output_ptr Pointer to the destination tensor. 
+ * Supported data
+ * types: QASYMM8
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ */
+__kernel void binary_logical_op(TENSOR3D_DECLARATION(input1), TENSOR3D_DECLARATION(input2),
+                                TENSOR3D_DECLARATION(output))
+{
+  Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1);
+  Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2);
+  Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+#if OP_CODE == 1 // LOGICAL AND
+  VSTORE(VEC_SIZE)
+  (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr) &&
+               VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr),
+           VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
+   0, (__global DATA_TYPE *)output.ptr);
+
+#elif OP_CODE == 2 // LOGICAL OR
+  VSTORE(VEC_SIZE)
+  (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr) ||
+               VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr),
+           VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
+   0, (__global DATA_TYPE *)output.ptr);
+
+#else // OP NOT SUPPORTED
+  return; // was a bare `return` (missing semicolon): a compile error for any unsupported OP_CODE
+
+#endif
+}
+#endif // if defined(OP_CODE) && defined(DATA_TYPE)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
new file mode 100644
index 000000000..2342fda9f
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#ifndef SCALE +#define SCALE 1.0f +#endif +#ifndef OFFSET +#define OFFSET 0 +#endif +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) +/** Perform a cast operation on an input tensor. + * + * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and + * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 + * @attention -DBOOL_INPUT : Whether type of input is bool. + * + * @param[in] input_ptr Pointer to the source image. 
Supported data + * types: F16/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[out] output_ptr Pointer to the destination image. Supported data + * types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image + */ +__kernel void cast(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) +{ + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VSTORE(VEC_SIZE) + (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr), + VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), + 0, (__global DATA_TYPE_OUT *)output.ptr); + VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE) + res = 
CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr), + VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)); +#if defined(BOOL_INPUT) + VEC_DATA_TYPE(char, VEC_SIZE) tmp = CONVERT(res, VEC_DATA_TYPE(char, VEC_SIZE)); + VEC_DATA_TYPE(char, VEC_SIZE) mask = (VEC_DATA_TYPE(char, VEC_SIZE))(1); + res = CONVERT(tmp & mask, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)); +#endif // defined(BOOL_INPUT) + + VSTORE(VEC_SIZE)(res, 0, (__global DATA_TYPE_OUT *)output.ptr); +} + +/** Perform a cast operation on an QASYMM8 input tensor. + * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and + * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int + * @attention Offset and Scale of input should be given as a preprocessor argument using + * -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5 + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 + * + * @param[in] input_ptr Pointer to the source image. Supported data + * types: F16/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[out] output_ptr Pointer to the destination image. 
Supported data + * types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image + */ +__kernel void cast_qasymm_in(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) +{ + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) + in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr); + VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET); + VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE); + + VEC_DATA_TYPE(int, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(int, VEC_SIZE)) - offset; + VEC_DATA_TYPE(float, VEC_SIZE) out_data = CONVERT(tmp, VEC_DATA_TYPE(float, VEC_SIZE)) * scale; + + VSTORE(VEC_SIZE) + (CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, + (__global DATA_TYPE_OUT *)output.ptr); +} + +/** Perform a cast operation on an QASYMM8 output tensor. + * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and + * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int + * @attention Offset and Scale of output should be given as a preprocessor argument using + * -DOFFSET=int, -DSCALE=float. e.g. 
-DOFFSET=1, -DSCALE=0.5 + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 + * + * @param[in] input_ptr Pointer to the source image. Supported data + * types: F16/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[out] output_ptr Pointer to the destination image. Supported data + * types: U8 + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image + */ +__kernel void cast_qasymm_out(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) +{ + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) + in_data 
= VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
+  VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET);
+  VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE);
+
+  VEC_DATA_TYPE(float, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(float, VEC_SIZE)) / scale;
+  VEC_DATA_TYPE(float, VEC_SIZE) out_data = tmp + CONVERT(offset, VEC_DATA_TYPE(float, VEC_SIZE));
+
+  VSTORE(VEC_SIZE)
+  (CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0,
+   (__global DATA_TYPE_OUT *)output.ptr);
+}
+#endif // defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
new file mode 100644
index 000000000..e005322f7
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
+/** Perform depth to space rearrangement of tensor
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
+ * e.g. 
-DDEPTH_OUT=16 + * @attention The value of the z-axis of output tensor should be given as a preprocessor argument + * using -DZ_OUT=size. e.g. -DZ_OUT=16 + * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. + * -DBLOCK_SIZE=1 + * + * @param[in] input_ptr Pointer to the source image. Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[out] output_ptr Pointer to the destination image. 
Supported data + * types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image + */ +__kernel void depth_to_space_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT); + + int out_index[4] = {0}; + int in_index[4] = {0}; + + out_index[0] = get_global_id(0); // W + out_index[1] = get_global_id(1); // H + out_index[2] = get_global_id(2) % Z_OUT; // C + out_index[3] = get_global_id(2) / Z_OUT; // B + + in_index[0] = out_index[0] / BLOCK_SIZE; + in_index[1] = out_index[1] / BLOCK_SIZE; + in_index[2] = out_index[2] + + ((out_index[1] % BLOCK_SIZE) * BLOCK_SIZE + out_index[0] % BLOCK_SIZE) * DEPTH_OUT; + in_index[3] = out_index[3]; + + *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset( + &in, in_index[0], in_index[1], in_index[2], in_index[3])); +} +#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT) + +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT) +/** Perform 
depth to space rearrangement of tensor (NHWC)
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
+ * e.g. -DDEPTH_OUT=16
+ * @attention The value of the z-axis of output tensor should be given as a preprocessor argument
+ * using -DZ_OUT=size. e.g. -DZ_OUT=16
+ * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g.
+ * -DBLOCK_SIZE=1
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data
+ * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
+ * image
+ * @param[out] output_ptr Pointer to the destination image. 
Supported data + * types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image + */ +__kernel void depth_to_space_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT); + + int out_index[4] = {0}; + int in_index[4] = {0}; + + out_index[0] = get_global_id(0); // C + out_index[1] = get_global_id(1); // W + out_index[2] = get_global_id(2) % Z_OUT; // H + out_index[3] = get_global_id(2) / Z_OUT; // B + + in_index[0] = out_index[0] + + ((out_index[2] % BLOCK_SIZE) * BLOCK_SIZE + out_index[1] % BLOCK_SIZE) * DEPTH_OUT; + in_index[1] = out_index[1] / BLOCK_SIZE; + in_index[2] = out_index[2] / BLOCK_SIZE; + in_index[3] = out_index[3]; + + *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset( + &in, in_index[0], in_index[1], in_index[2], in_index[3])); +} +#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl 
b/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl new file mode 100644 index 000000000..dd8cb6d93 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS) +/** Perform embedding_lookup of input tensor + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. + * -DDATA_TYPE=short + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 + * @attention Output tensor depth should be given as a preprocessor argument using + * -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16 + * @attention Number of input dimensions are passed as a preprocessor argument using + * -DNUM_DIMS=size, e.g. -DNUM_DIMS=4 + * + * @param[in] input_ptr Pointer to the source tensor. 
Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] input_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] input_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[out] output_ptr Pointer to the destination tensor. 
Supported + * data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + * @param[in] lookups_ptr Pointer to the lookups vector. Supported data + * types: S32 + * @param[in] lookups_stride_x Stride of the lookups vector in X dimension (in + * bytes) + * @param[in] lookups_step_x lookups_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups + * vector + */ + +__kernel void embedding_lookup(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), + VECTOR_DECLARATION(lookups)) +{ + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT); + + Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups); + + // lookup ids for based on the tensor dimensions + int lup_id[4] = {0}; + + lup_id[0] = (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0))) + : get_global_id(0); + lup_id[1] = (NUM_DIMS == 2) ? 
*((__global int *)vector_offset(&lups, get_global_id(1))) + : get_global_id(1); + lup_id[2] = (NUM_DIMS == 3) ? *((__global int *)vector_offset(&lups, get_global_id(2))) + : get_global_id(2) % DEPTH_OUT; + lup_id[3] = (NUM_DIMS == 4) + ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) + : get_global_id(2) / DEPTH_OUT; + + in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + + lup_id[1] * input_step_y + lup_id[2] * input_step_z + lup_id[3] * input_step_w; + + VSTORE(VEC_SIZE) + (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, + (__global DATA_TYPE *)out.ptr); +} +#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gather_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/gather_ex.cl new file mode 100644 index 000000000..09f776156 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/gather_ex.cl @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(AXIS) && defined(INDICES_DIM) + +/** Performs the Gather operation along the chosen axis + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. 
+ * -DDATA_TYPE=short
+ * @note Axis should be given as a preprocessor argument using -DAXIS=axis. e.g. -DAXIS=1
+ * @attention Output tensor depth should be given as a preprocessor argument using
+ *            -DOUTPUT_DIM_Z=size. e.g. -DOUTPUT_DIM_Z=16
+ * @attention Input tensor depth should be given as a preprocessor argument using
+ *            -DINPUT_DIM_Z=size. e.g. -DINPUT_DIM_Z=16
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: U8/S8/U16/S16/U32/S32/F16/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per work item (in bytes)
+ * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                         input_stride_w * number of elements along W processed per work item (in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  Offset of the first element in the source tensor
+ * @param[in]  indices_ptr                          Pointer to the indices tensor. Supported data types: S32
+ * @param[in]  indices_stride_x                     Stride of the indices tensor in X dimension (in bytes)
+ * @param[in]  indices_step_x                       indices_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  indices_stride_y                     Stride of the indices tensor in Y dimension (in bytes)
+ * @param[in]  indices_step_y                       indices_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  indices_stride_z                     Stride of the indices tensor in Z dimension (in bytes)
+ * @param[in]  indices_step_z                       indices_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per work item (in bytes)
+ * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per work item (in bytes)
+ * @param[in]  output_offset_first_element_in_bytes Offset of the first element in the destination tensor
+ */
+__kernel void gather_ex(TENSOR4D_DECLARATION(input), TENSOR3D_DECLARATION(indices),
+                        TENSOR4D_DECLARATION(output))
+{
+  // One work item per output element; Z and W of the output are packed into
+  // global dimension 2 and unpacked here using OUTPUT_DIM_Z.
+  const int px = get_global_id(0);
+  const int py = get_global_id(1);
+  const int pz = get_global_id(2) % OUTPUT_DIM_Z;
+  const int pw = get_global_id(2) / OUTPUT_DIM_Z;
+
+  const Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, INPUT_DIM_Z);
+  const Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(indices);
+  Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, OUTPUT_DIM_Z);
+
+  // Compile-time dispatch: the gather axis (AXIS) and the rank of the indices
+  // tensor (INDICES_DIM) select which output coordinates address the indices
+  // tensor and how the remaining coordinates map back onto the input tensor.
+#if AXIS == 0
+#if INDICES_DIM == 1
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, px, 0, 0);
+  __global const uchar *input_addr = tensor4D_offset(&input, index, py, pz, pw);
+#elif INDICES_DIM == 2
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, 0);
+  __global const uchar *input_addr = tensor4D_offset(&input, index, pz, pw, 0);
+#elif INDICES_DIM == 3
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, pz);
+  __global const uchar *input_addr = tensor4D_offset(&input, index, pw, 0, 0);
+#endif
+#elif AXIS == 1
+#if INDICES_DIM == 1
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, py, 0, 0);
+  __global const uchar *input_addr = tensor4D_offset(&input, px, index, pz, pw);
+#elif INDICES_DIM == 2
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, py, pz, 0);
+  __global const uchar *input_addr = tensor4D_offset(&input, px, index, pw, 0);
+#elif INDICES_DIM == 3
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, py, pz, pw);
+  __global const uchar *input_addr = tensor4D_offset(&input, px, index, 0, 0);
+#endif
+#elif AXIS == 2
+  // NOTE(review): only INDICES_DIM 1 and 2 are covered for AXIS == 2, and only
+  // INDICES_DIM 1 for AXIS == 3 — other combinations leave input_addr
+  // undeclared and fail to compile; presumably ruled out by host-side validate().
+#if INDICES_DIM == 1
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, pz, 0, 0);
+  __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, pw);
+#elif INDICES_DIM == 2
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, pz, pw, 0);
+  __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, 0);
+#endif
+#elif AXIS == 3
+#if INDICES_DIM == 1
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, pw, 0, 0);
+  __global const uchar *input_addr = tensor4D_offset(&input, px, py, pz, index);
+#endif
+#endif // AXIS
+
+  *(__global DATA_TYPE *)output.ptr = *((__global const DATA_TYPE *)input_addr);
+}
+
+#endif // defined(DATA_TYPE) && defined(AXIS) && defined(INDICES_DIM)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl
new file mode 100644
index 000000000..73f29e3e5
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
+/** Perform hashtable_lookup of input tensor
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
+ *       -DDATA_TYPE=short
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ *            -DVEC_SIZE=16
+ * @attention Output tensor depth should be given as a preprocessor argument using
+ *            -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16
+ * @attention Number of input dimensions are passed as a preprocessor argument using
+ *            -DNUM_DIMS=size, e.g. -DNUM_DIMS=4
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor.
Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                         input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  lookups_ptr                          Pointer to the lookups vector. Supported data types: S32
+ * @param[in]  lookups_stride_x                     Stride of the lookups vector in X dimension (in bytes)
+ * @param[in]  lookups_step_x                       lookups_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector
+ */
+__kernel void hashtable_lookup(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output),
+                               VECTOR_DECLARATION(lookups))
+{
+  Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+  Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT);
+
+  Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups);
+
+  // lup_id[d] is the coordinate used to read the input along dimension d.
+  // Exactly one dimension (the outermost, NUM_DIMS - 1) is remapped through
+  // the lookups vector; the others pass the work-item coordinates through.
+  // As in gather_ex, Z and W share global dimension 2, split by DEPTH_OUT.
+  int lup_id[4] = {0};
+
+  lup_id[0] = (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0)))
+                              : get_global_id(0);
+  lup_id[1] = (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1)))
+                              : get_global_id(1);
+  lup_id[2] = (NUM_DIMS == 3) ? *((__global int *)vector_offset(&lups, get_global_id(2)))
+                              : get_global_id(2) % DEPTH_OUT;
+  lup_id[3] = (NUM_DIMS == 4)
+                  ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT))
+                  : get_global_id(2) / DEPTH_OUT;
+
+  // A negative lookup key means "no entry": emit zeros for this output row.
+  if (lup_id[NUM_DIMS - 1] < 0)
+  {
+    VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, 0, (__global DATA_TYPE *)out.ptr);
+    return;
+  }
+
+  // NOTE(review): CONVERT_TO_TENSOR4D_STRUCT_NO_STEP already added
+  // input_offset_first_element_in_bytes to in.ptr, so it appears to be added
+  // twice here — harmless only if the offset is 0; confirm against host code.
+  in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x +
+            lup_id[1] * input_step_y + lup_id[2] * input_step_z + lup_id[3] * input_step_w;
+
+  VSTORE(VEC_SIZE)
+  (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0,
+   (__global DATA_TYPE *)out.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
new file mode 100644
index 000000000..0e123ae0a
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
@@ -0,0 +1,352 @@
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && \
+    defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) &&
+       // defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define EXPAND(x) x
+
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+// Two-level macros so that `size` is macro-expanded before token pasting.
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+// *_DECLARATION macros expand to the full kernel-argument list the host-side
+// ICLKernel::add_*_argument() calls set up for a buffer of the given rank.
+#define VECTOR_DECLARATION(name) \
+  __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, \
+      uint name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name) \
+  __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \
+      uint name##_step_y, uint name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name) \
+  __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \
+      uint name##_step_y, uint name##_stride_z, uint name##_step_z, \
+      uint name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name) \
+  __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \
+      uint name##_step_y, uint name##_stride_z, uint name##_step_z, uint name##_stride_w, \
+      uint name##_step_w, uint name##_offset_first_element_in_bytes
+
+// CONVERT_TO_*_STRUCT builds a struct whose ptr is advanced to this work
+// item's element; the *_NO_STEP variants pass 0 steps so ptr stays at the
+// tensor origin (plus the first-element offset).
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+  update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+                             name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+  update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+  update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+                            name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+  update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \
+                            name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+  update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
+                                          name##_stride_x, name##_step_x, name##_stride_y, \
+                                          name##_step_y, name##_stride_z, name##_step_z)
+
+// NOTE(review): unlike the other NO_STEP variants this one deliberately keeps
+// name##_step_z (matches upstream ACL helpers.h) — do not "fix" to 0.
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+  update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
+                                          name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, \
+                                          name##_step_z)
+
+// NOTE(review): identical redefinition of CONVERT_TENSOR3D_TO_IMAGE_STRUCT
+// above — legal (token-identical) but redundant; candidate for removal.
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+  update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
+                                          name##_stride_x, name##_step_x, name##_stride_y, \
+                                          name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name) \
+  update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+                               name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \
+                               name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+  update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+                               0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \
+  update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+                               name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \
+                               name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+  update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+                               0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, \
+                               mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+  __global uchar *ptr;               /**< Pointer to the starting postion of the buffer */
+  int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+  int stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+  __global uchar *ptr;               /**< Pointer to the starting postion of the buffer */
+  int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+  int stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+  int stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+  __global uchar *ptr;               /**< Pointer to the starting postion of the buffer */
+  int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+  int stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+  int stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+  int stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+  __global uchar *ptr;               /**< Pointer to the starting postion of the buffer */
+  int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+  int stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+  int stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+  int stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+  int stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's
+ * data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per
+ *                                          workitem(in bytes)
+ *
+ * @return A vector object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
+                                         uint stride_x, uint step_x)
+{
+  Vector vector = {
+      .ptr = ptr,
+      .offset_first_element_in_bytes = offset_first_element_in_bytes,
+      .stride_x = stride_x,
+  };
+  vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+  return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's
+ * data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per
+ *                                          workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per
+ *                                          workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
+                                       uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+  Image img = {.ptr = ptr,
+               .offset_first_element_in_bytes = offset_first_element_in_bytes,
+               .stride_x = stride_x,
+               .stride_y = stride_y};
+  img.ptr +=
+      img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+  return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this
+ * workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per
+ *                                          workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per
+ *                                          workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per
+ *                                          workitem(in bytes)
+ *
+ * @return An image object (Z collapsed into the pointer; only X/Y strides kept)
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr,
+                                                     uint offset_first_element_in_bytes,
+                                                     uint stride_x, uint step_x, uint stride_y,
+                                                     uint step_y, uint stride_z, uint step_z)
+{
+  Image img = {.ptr = ptr,
+               .offset_first_element_in_bytes = offset_first_element_in_bytes,
+               .stride_x = stride_x,
+               .stride_y = stride_y};
+  img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x +
+             get_global_id(1) * step_y + get_global_id(2) * step_z;
+  return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this
+ * workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per
+ *                                          workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per
+ *                                          workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per
+ *                                          workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr,
+                                             uint offset_first_element_in_bytes, uint stride_x,
+                                             uint step_x, uint stride_y, uint step_y, uint stride_z,
+                                             uint step_z)
+{
+  Tensor3D tensor = {.ptr = ptr,
+                     .offset_first_element_in_bytes = offset_first_element_in_bytes,
+                     .stride_x = stride_x,
+                     .stride_y = stride_y,
+                     .stride_z = stride_z};
+  tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x +
+                get_global_id(1) * step_y + get_global_id(2) * step_z;
+  return tensor;
+}
+
+/** Wrap 4D tensor information into a tensor structure, and make the pointer point at this
+ * workitem's data. Global dimension 2 carries both Z and W: the Z coordinate is
+ * get_global_id(2) % mod_size and the W coordinate is get_global_id(2) / mod_size.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] stride_x/step_x ... stride_w/step_w  Per-dimension strides and per-workitem steps
+ *                                          (in bytes)
+ * @param[in] mod_size                      Depth (Z extent) used to unpack global dimension 2
+ *
+ * @return A 4D tensor object
+ */
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr,
+                                             uint offset_first_element_in_bytes, uint stride_x,
+                                             uint step_x, uint stride_y, uint step_y, uint stride_z,
+                                             uint step_z, uint stride_w, uint step_w, uint mod_size)
+{
+  Tensor4D tensor = {.ptr = ptr,
+                     .offset_first_element_in_bytes = offset_first_element_in_bytes,
+                     .stride_x = stride_x,
+                     .stride_y = stride_y,
+                     .stride_z = stride_z,
+                     .stride_w = stride_w};
+
+  tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x +
+                get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z +
+                (get_global_id(2) / mod_size) * step_w;
+  return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+  return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+  return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+  return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+  return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z +
+         w * tensor->stride_w;
+}
+
+#endif // ARM_COMPUTE_HELPER_H
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
new file mode 100644
index 000000000..c39138caa
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
@@ -0,0 +1,406 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPERS_ASYMM_H
+#define ARM_COMPUTE_HELPERS_ASYMM_H
+
+#include "helpers.h"
+
+// Fixed-point (quantized) arithmetic helpers. Each *_IMPL(size) macro defines
+// an inline function over int vectors of the given width; the ASYMM_*(…, size)
+// dispatch macros at the bottom paste the width onto the function name.
+// NOTE(review): the routines and constants appear to mirror gemmlowp's
+// fixed-point math (see the COMPMID-907 reference below) — confirm before
+// changing any magic constant.
+
+/** Correctly-rounded-to-nearest division by a power-of-two.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Correctly-rounded-to-nearest division by a power-of-two.
+ */
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) \
+      asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+  { \
+    VEC_DATA_TYPE(int, size) \
+    mask = (1 << exponent) - 1; \
+    const VEC_DATA_TYPE(int, size) zero = 0; \
+    const VEC_DATA_TYPE(int, size) one = 1; \
+    VEC_DATA_TYPE(int, size) \
+    threshold = (mask >> 1) + select(zero, one, x < 0); \
+    return (x >> exponent) + select(zero, one, (x & mask) > threshold); \
+  }
+
+/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
+ * rounding to the nearest value, and saturating -1 * -1 to the maximum value.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Product of two fixed-point numbers.
+ */
+#define ASYMM_MULT_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) \
+      asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+  { \
+    VEC_DATA_TYPE(int, size) \
+    overflow = a == b && a == INT_MIN; \
+    VEC_DATA_TYPE(long, size) \
+    a_64 = convert_long##size(a); \
+    VEC_DATA_TYPE(long, size) \
+    b_64 = convert_long##size(b); \
+    VEC_DATA_TYPE(long, size) \
+    ab_64 = a_64 * b_64; \
+    /* COMPMID-907 */ \
+    VEC_DATA_TYPE(int, size) \
+    ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31)); \
+    return select(ab_x2_high32, INT_MAX, overflow); \
+  }
+
+/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) \
+      asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) \
+                                                                              a) \
+  { \
+    const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \
+    const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \
+    const int k_fractional_bits = 31; \
+    VEC_DATA_TYPE(int, size) \
+    x = a + (1 << (k_fractional_bits - 3)); \
+    VEC_DATA_TYPE(int, size) \
+    x2 = ASYMM_MULT(x, x, size); \
+    VEC_DATA_TYPE(int, size) \
+    x3 = ASYMM_MULT(x2, x, size); \
+    VEC_DATA_TYPE(int, size) \
+    x4 = ASYMM_MULT(x2, x2, size); \
+    VEC_DATA_TYPE(int, size) \
+    x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \
+    VEC_DATA_TYPE(int, size) \
+    x4_over_24_plus_x3_over_6_plus_x2 = \
+        ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \
+    VEC_DATA_TYPE(int, size) \
+    x4_over_24_plus_x3_over_6_plus_x2_over_2 = \
+        ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \
+    return constant_term + \
+           ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \
+  }
+
+/** Each bit of the result is set to the corresponding bit of either then_val or
+ * else_val depending on whether the corresponding bit of if_mask is set.
+ * Equivalent to the VBSL instruction in ARM NEON.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Result contaning bits from @p then_val or from @p else_val depending on corresponding
+ * bit in @p if_mask is set or not.
+ */
+#define ASYMM_SELECT_USING_MASK_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, \
+                                                                VEC_DATA_TYPE(int, size) then_val, \
+                                                                VEC_DATA_TYPE(int, size) else_val) \
+  { \
+    return (if_mask & then_val) ^ (~if_mask & else_val); \
+  }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with bits set when corresponding bit in @p a is zero.
+ */
+#define ASYMM_MASK_IF_ZERO_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \
+  { \
+    const VEC_DATA_TYPE(int, size) all_zeros = 0; \
+    const VEC_DATA_TYPE(int, size) all_ones = ~0; \
+    return select(all_zeros, all_ones, a == 0); \
+  }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is non-zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with bits set when corresponding bit in @p a is non zero.
+ */
+#define ASYMM_MASK_IF_NON_ZERO_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \
+  { \
+    const VEC_DATA_TYPE(int, size) all_zeros = 0; \
+    const VEC_DATA_TYPE(int, size) all_ones = ~0; \
+    return select(all_zeros, all_ones, a != 0); \
+  }
+
+/** Multiply @p result by @p fp_multiplier where the bit selected by @p exponent
+ * (relative to the integer/fractional split) is set in @p remainder — one step
+ * of the barrel-shifter exp() evaluation used by asymm_exp_on_negative_values.
+ *
+ * @param[in] size Size of vector.
+ */
+#define EXP_BARREL_SHIFTER_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size( \
+      VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, \
+      int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
+  { \
+    if (k_integer_bits > exponent) \
+    { \
+      const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0; \
+      return ASYMM_SELECT_USING_MASK( \
+          ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \
+          ASYMM_MULT(result, fp_multiplier, size), result, size); \
+    } \
+    \
+    return result; \
+  }
+
+/** Calculates \f$ exp(x) \f$ for x < 0.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) \
+      asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \
+  { \
+    const int k_fractional_bits = 31 - k_integer_bits; \
+    VEC_DATA_TYPE(int, size) \
+    k_one_quarter = 1 << (k_fractional_bits - 2); \
+    VEC_DATA_TYPE(int, size) \
+    mask = k_one_quarter - 1; \
+    VEC_DATA_TYPE(int, size) \
+    a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter; \
+    VEC_DATA_TYPE(int, size) \
+    a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \
+    VEC_DATA_TYPE(int, size) \
+    result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL( \
+        a_mod_quarter_minus_one_quarter_scaled, size); \
+    VEC_DATA_TYPE(int, size) \
+    remainder = a_mod_quarter_minus_one_quarter - a; \
+    \
+    result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, \
+                                remainder, size); \
+    result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, \
+                                remainder, size); \
+    result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, \
+                                remainder, size); \
+    result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, \
+                                remainder, size); \
+    result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, \
+                                remainder, size); \
+    result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, \
+                                size); \
+    result = \
+        EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \
+    \
+    if (k_integer_bits > 5) \
+    { \
+      const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5)); \
+      result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size); \
+    } \
+    \
+    const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \
+    return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size); \
+  }
+
+/** Calculates the product of an integer value by a power of two, with either a positive exponent
+ * (equivalent to an arithmetic left shift, saturating) or a negative exponent
+ * (equivalent to an arithmetic right shift, rounding to nearest).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Arithmetic left or right shift.
+ */
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) \
+      asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+  { \
+    if (exponent < 0) \
+    { \
+      return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \
+    } \
+    \
+    const VEC_DATA_TYPE(int, size) min = INT_MIN; \
+    const VEC_DATA_TYPE(int, size) max = INT_MAX; \
+    int threshold = ((1 << (31 - exponent)) - 1); \
+    VEC_DATA_TYPE(int, size) \
+    positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \
+    VEC_DATA_TYPE(int, size) \
+    negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \
+    VEC_DATA_TYPE(int, size) \
+    result = x << exponent; \
+    result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \
+    result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \
+    return result; \
+  }
+
+/** Calculates (a+b)/2, rounded to the nearest integer.
+ * Equivalent to VRHADD in the ARM NEON instruction set.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return (a+b)/2, rounded to the nearest integer.
+ */
+#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) \
+      asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+  { \
+    VEC_DATA_TYPE(long, size) \
+    a64 = convert_long##size(a); \
+    VEC_DATA_TYPE(long, size) \
+    b64 = convert_long##size(b); \
+    VEC_DATA_TYPE(long, size) \
+    sum = a64 + b64; \
+    const VEC_DATA_TYPE(long, size) one = 1; \
+    const VEC_DATA_TYPE(long, size) minus_one = -1; \
+    VEC_DATA_TYPE(long, size) \
+    sign = select(minus_one, one, sum >= 0); \
+    return convert_int##size((sum + sign) / 2); \
+  }
+
+/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) \
+      asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \
+  { \
+    const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \
+    const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2); \
+    VEC_DATA_TYPE(int, size) \
+    half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size); \
+    const VEC_DATA_TYPE(int, size) Q2_48_over_17 = 1515870810; \
+    const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540; \
+    VEC_DATA_TYPE(int, size) \
+    x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size); \
+    /* Three Newton-Raphson refinement iterations of 1/half_denominator. */ \
+    for (int i = 0; i < 3; i++) \
+    { \
+      VEC_DATA_TYPE(int, size) \
+      half_denominator_times_x = ASYMM_MULT(half_denominator, x, size); \
+      VEC_DATA_TYPE(int, size) \
+      one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x; \
+      VEC_DATA_TYPE(int, size) \
+      tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size); \
+      x = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size); \
+    } \
+    return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size); \
+  }
+
+/** Considering the integer value as fixed-point, change the number of integer bits and update value
+ * accordingly.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Rescaled value.
+ */
+#define ASYMM_RESCALE_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, \
+                                                      int src_integer_bits, int dst_integer_bits) \
+  { \
+    int exponent = src_integer_bits - dst_integer_bits; \
+    return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \
+  }
+
+// Width-dispatch wrappers: paste `size` onto the implementation name.
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) \
+  asymm_rounding_divide_by_POW2_##size(x, exponent)
+#define ASYMM_MULT(a, b, size) asymm_mult##size(a, b)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
+  ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) \
+  asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
+#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) \
+  asymm_select_using_mask##size(if_mask, then_val, else_val)
+#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
+#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
+#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, \
+                           remainder, size) \
+  exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, \
+                           remainder)
+#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) \
+  asymm_exp_on_negative_values##size(a, k_integer_bits)
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) \
+  asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) \
+  asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
+#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
+#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \
+  asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
+
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4) +ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8) +ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16) + +ASYMM_MULT_IMPL(2) +ASYMM_MULT_IMPL(4) +ASYMM_MULT_IMPL(8) +ASYMM_MULT_IMPL(16) + +ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2) +ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4) +ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8) +ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16) + +ASYMM_SELECT_USING_MASK_IMPL(2) +ASYMM_SELECT_USING_MASK_IMPL(4) +ASYMM_SELECT_USING_MASK_IMPL(8) +ASYMM_SELECT_USING_MASK_IMPL(16) + +ASYMM_MASK_IF_ZERO_IMPL(2) +ASYMM_MASK_IF_ZERO_IMPL(4) +ASYMM_MASK_IF_ZERO_IMPL(8) +ASYMM_MASK_IF_ZERO_IMPL(16) + +ASYMM_MASK_IF_NON_ZERO_IMPL(2) +ASYMM_MASK_IF_NON_ZERO_IMPL(4) +ASYMM_MASK_IF_NON_ZERO_IMPL(8) +ASYMM_MASK_IF_NON_ZERO_IMPL(16) + +EXP_BARREL_SHIFTER_IMPL(2) +EXP_BARREL_SHIFTER_IMPL(4) +EXP_BARREL_SHIFTER_IMPL(8) +EXP_BARREL_SHIFTER_IMPL(16) + +ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2) +ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4) +ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8) +ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16) + +ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2) +ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4) +ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8) +ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16) + +ASYMM_ROUNDING_HALF_SUM_IMPL(2) +ASYMM_ROUNDING_HALF_SUM_IMPL(4) +ASYMM_ROUNDING_HALF_SUM_IMPL(8) +ASYMM_ROUNDING_HALF_SUM_IMPL(16) + +ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2) +ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4) +ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8) +ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16) + +ASYMM_RESCALE_IMPL(2) +ASYMM_RESCALE_IMPL(4) +ASYMM_RESCALE_IMPL(8) +ASYMM_RESCALE_IMPL(16) + +#endif // ARM_COMPUTE_HELPERS_ASYMM_H
\ No newline at end of file diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl new file mode 100644 index 000000000..1d96150f8 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(DIM_X) && \ + defined(DIM_Y) && defined(DIM_Z) +/** This function normalizes the input 2D tensor across the first dimension with respect to mean and + * standard deviation of the same dimension. + * + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. 
+ * -DVEC_SIZE=16 + * @attention Data type should be passed using the -DDATA_TYPE=data_type compile flag, e.g. + * -DDATA_TYPE=float + * @attention Normalization epsilon parameter should be given as a preprocessor argument with + * -DEPSILON=value. e.g. -DEPSILON=0.001f + * @attention Dimensions X, Y, and Z should be given as a preprocessor argument with -DDIM_X=value, + * -DDIM_Y=value, -DDIM_Z=value. e.g. -DDIM_X=6, -DDIM_Y=2, -DDIM_Z=7 + * + * @param[in] input_ptr Pointer to the first source tensor. Supported + * data types: F16/F32 + * @param[in] input_stride_x Stride of the first source tensor in X dimension + * (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the first source tensor in Y dimension + * (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the first source tensor in Z dimension + * (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first + * source tensor + * @param[out] output_ptr (Optional) Pointer to the destination tensor. 
+ * Supported data types: same as @p input_ptr + * @param[in] output_stride_x (Optional) Stride of the destination tensor in X + * dimension (in bytes) + * @param[in] output_step_x (Optional) output_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y + * dimension (in bytes) + * @param[in] output_step_y (Optional) output_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z + * dimension (in bytes) + * @param[in] output_step_z (Optional) output_stride_z * number of elements + * along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in + * the destination tensor + * @param[in] gamma_ptr (Optional) Pointer to the gamma tensor. + * Supported data types: same as @p input_ptr + * @param[in] gamma_stride_x (Optional) Stride of the gamma tensor in X + * dimension (in bytes) + * @param[in] gamma_step_x (Optional) output_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] gamma_offset_first_element_in_bytes (Optional) The offset of the first element in + * the gamma tensor + * @param[in] beta_ptr (Optional) Pointer to the beta tensor. 
Supported + * data types: same as @p input_ptr + * @param[in] beta_stride_x (Optional) Stride of the beta tensor in X + * dimension (in bytes) + * @param[in] beta_step_x (Optional) output_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] beta_offset_first_element_in_bytes (Optional) The offset of the first element in + * the beta tensor + */ +__kernel void instance_normalization_ex(TENSOR4D_DECLARATION(input), +#ifndef IN_PLACE + TENSOR4D_DECLARATION(output) +#endif /* IN_PLACE */ +#ifdef GAMMA + , + VECTOR_DECLARATION(gamma) +#endif // GAMMA +#ifdef BETA + , + VECTOR_DECLARATION(beta) +#endif // BETA + ) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); +#ifndef IN_PLACE + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); +#endif /* IN_PLACE */ + + float sum = 0.f; + float sum_sq = 0.f; + +#if defined(NHWC) + + const int ch = get_global_id(0); // Current channel + const int batch = get_global_id(2); // Current batch + const int elements_plane = DIM_Y * DIM_Z; + + for (int i_w = 0; i_w < DIM_Y; ++i_w) + { + for (int i_h = 0; i_h < DIM_Z; ++i_h) + { + float data = (float)*((__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch)); + sum += data; + sum_sq += data * data; + } + } + +#else // !defined(NHWC) + const int ch = get_global_id(2) % DIM_Z; // Current channel + const int batch = get_global_id(2) / DIM_Z; // Current batch + const int elements_plane = DIM_X * DIM_Y; + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + part_sum = 0.f; + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + part_sum_sq = 0.f; + // Calculate partial sum + for (int y = 0; y < DIM_Y; ++y) + { + int x = 0; + for (; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE) + { + // Load data + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch)); + part_sum += data; + part_sum_sq += data * data; + } + // Left-overs loop + for (; x < DIM_X; ++x) + { + DATA_TYPE data = *((__global DATA_TYPE 
*)tensor4D_offset(&in, x, y, ch, batch)); + part_sum.s0 += data; + part_sum_sq.s0 += data * data; + } + } +// Perform reduction +#if VEC_SIZE > 8 + part_sum.s01234567 += part_sum.s89abcdef; + part_sum_sq.s01234567 += part_sum_sq.s89abcdef; +#endif // VEC_SIZE > 8 +#if VEC_SIZE > 4 + part_sum.s0123 += part_sum.s4567; + part_sum_sq.s0123 += part_sum_sq.s4567; +#endif // VEC_SIZE > 4 +#if VEC_SIZE > 2 + part_sum.s01 += part_sum.s23; + part_sum_sq.s01 += part_sum_sq.s23; +#endif // VEC_SIZE > 2 + part_sum.s0 += part_sum.s1; + part_sum_sq.s0 += part_sum_sq.s1; + + sum = (float)part_sum.s0; + sum_sq = (float)part_sum_sq.s0; + +#endif // defined(NHWC) + + const float mean_float = (sum / elements_plane); + const DATA_TYPE mean = (DATA_TYPE)mean_float; + const float var_float = (sum_sq / elements_plane) - (mean_float * mean_float); +#if defined(GAMMA) + const float multip_float = *((__global DATA_TYPE *)gamma_ptr + ch) / sqrt(var_float + EPSILON); + const DATA_TYPE multip = (DATA_TYPE)multip_float; +#else // !defined(GAMMA) + const DATA_TYPE multip = (DATA_TYPE)0; +#endif // defined(GAMMA) +#if defined(BETA) + const DATA_TYPE beta = *((__global DATA_TYPE *)beta_ptr + ch); +#else // !defined(BETA) + const DATA_TYPE beta = 0; +#endif // defined(BETA) + +#if defined(NHWC) + + for (int i_w = 0; i_w < DIM_Y; ++i_w) + { + for (int i_h = 0; i_h < DIM_Z; ++i_h) + { + __global DATA_TYPE *input_address = + (__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch); +#ifdef IN_PLACE + __global DATA_TYPE *output_address = input_address; +#else /* !IN_PLACE */ + __global DATA_TYPE *output_address = + (__global DATA_TYPE *)tensor4D_offset(&out, ch, i_w, i_h, batch); +#endif /* IN_PLACE */ + *(output_address) = (*(input_address)-mean) * multip + beta; + } + } + +#else // !defined(NHWC) + for (int y = 0; y < DIM_Y; ++y) + { + int x = 0; + for (; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE) + { + __global DATA_TYPE *input_address = + (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, 
batch); +#ifdef IN_PLACE + __global DATA_TYPE *output_address = input_address; +#else /* !IN_PLACE */ + __global DATA_TYPE *output_address = + (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch); +#endif /* IN_PLACE */ + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + data = VLOAD(VEC_SIZE)(0, input_address); + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + res = (data - mean) * multip + beta; + VSTORE(VEC_SIZE) + (res, 0, output_address); + } + // Left-overs loop + for (; x < DIM_X; ++x) + { + __global DATA_TYPE *input_address = + (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch); +#ifdef IN_PLACE + __global DATA_TYPE *output_address = input_address; +#else /* !IN_PLACE */ + __global DATA_TYPE *output_address = + (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch); +#endif /* IN_PLACE */ + *(output_address) = (*(input_address)-mean) * multip + beta; + } + } +#endif // defined(NHWC) +} +#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(DIM_X) && \ + defined(DIM_Y) && defined(DIM_Z) */ diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl new file mode 100644 index 000000000..4aa7883c3 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(DATA_TYPE) +/** Performs a negation of input tensor. + * + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * + * @param[in] in_ptr Pointer to the source image. Supported data types: + * S16/S32/F16/F32. + * @param[in] in_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] in_step_x in_stride_x * number of elements along X processed + * per work item (in bytes) + * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data + * types: same as @p input_ptr + * @param[in] out_stride_x Stride of the destination image in X dimension (in + * bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed + * per work item (in bytes) + * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination + * image + * + */ +__kernel void neg_tensor(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) +{ + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VSTORE(VEC_SIZE) + (-VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), 0, (__global DATA_TYPE *)output.ptr); +} +#endif // defined(DATA_TYPE) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl new file mode 100644 index 000000000..2074d3ceb --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers_asymm.h" + +#ifdef SATURATE +#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x)) +#else /* SATURATE */ +#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x)) +#endif /* SATURATE */ +#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round) + +#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT) +/** Performs a pixelwise multiplication used to quantize down the int32 accumulator values of + * GEMMLowp to QASYMM8 + * + * The following computations will be performed by the kernel: + * + * -# Add offset terms to inputs + * -# Multiply inputs + * -# Add offset terms to final result + * -# Multiply each entry of result by result_mult_int + * -# Shift the int32 accumulator by result_shift + * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. + * + * @attention The inputs and output data types need to be passed at compile time using + * -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: + * e.g. 
-DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar + * @attention The offset factor of inputs must be passed at compile time using -DIN1_OFFSET and + * -DIN2_OFFSET + * @attention The offset, scalar scale factor and number of bits to shift right of output tensor + * must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and + * -DRESULT_SHIFT + * + * @param[in] in1_ptr Pointer to the source image. Supported data types: + * U8 + * @param[in] in1_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] in1_step_x in1_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] in1_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] in1_step_y in1_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] in1_stride_z Stride of the source image in Y dimension (in + * bytes) + * @param[in] in1_step_z in1_stride_z * number of elements along Y processed + * per workitem(in bytes) + * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] in2_ptr Pointer to the source image. Supported data types: + * U8 + * @param[in] in2_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] in2_step_x in2_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] in2_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] in2_step_y in2_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] in2_stride_z Stride of the source image in Y dimension (in + * bytes) + * @param[in] in2_step_z in2_stride_z * number of elements along Y processed + * per workitem(in bytes) + * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. 
Supported data + * types: U8 + * @param[in] out_stride_x Stride of the destination image in X dimension (in + * bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in + * bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] out_stride_z Stride of the destination image in Y dimension (in + * bytes) + * @param[in] out_step_z out_stride_z * number of elements along Y processed + * per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination + * image + * @param[in] scale Float scaling factor. Supported data types: F32 + */ +__kernel void pixelwise_mul_qasymm8(TENSOR3D_DECLARATION(in1), TENSOR3D_DECLARATION(in2), + TENSOR3D_DECLARATION(out), const float scale) +{ + // Get pixels pointer + Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); + Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); + + // Load data + VEC_DATA_TYPE(int, 16) + in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16)); + VEC_DATA_TYPE(int, 16) + in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16)); + + // Perform multiplication of two inputs + VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET); + VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET); + VEC_DATA_TYPE(int, 16) out_val = in1_val * in2_val; + + // Multiply with a multiplier smaller than 1 + out_val = + ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(out_val, RESULT_MULT_INT, RESULT_SHIFT, 16); + out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET); + + VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16)); + + // TODO: Apply min-max BOUND to support fuse with relu. 
+ /* + #if defined(MIN_BOUND) + res = max(res, (uchar16)MIN_BOUND); + #endif // defined(MIN_BOUND) + #if defined(MAX_BOUND) + res = min(res, (uchar16)MAX_BOUND); + #endif // defined(MAX_BOUND) + */ + + // Store result + VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr); +} +#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl new file mode 100644 index 000000000..62a8901f6 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(DATA_TYPE) +/** Returns result of prelu function implemented as below: + * f(input) = alpha * input for input < 0, f(input) = input for input >= 0. + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 + * @note Can only take floating point data types. + * + * @param[in] input1_ptr Pointer to the source image. 
Supported Data + * types : F16/F32 + * @param[in] input1_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input1_step_x input1_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input1_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input1_step_y input1_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input1_step_z input1_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[in] alpha_ptr Pointer to the source image. Supported Data + * types : F16/F32 + * @param[in] alpha_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] alpha_step_x input2_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] alpha_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] alpha_step_y input2_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] alpha_step_z input2_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source + * image + * + * @param[out] output_ptr Pointer to the destination image. 
Supported + * data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image + */ +__kernel void prelu(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(alpha), + TENSOR3D_DECLARATION(output)) +{ + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VSTORE(VEC_SIZE) + (VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) < 0 + ? VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) * + VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)alpha.ptr) + : VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), + 0, (__global DATA_TYPE *)output.ptr); +} +#endif // defined(DATA_TYPE) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl new file mode 100644 index 000000000..5e0abd585 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" +#define SUB(x, y) (x) - (y) + +#if defined(OFF_IN) && defined(OFF_ALPHA) && defined(OFF_OUT) && defined(SCALE_IN) && \ + defined(SCALE_ALPHA) && defined(SCALE_OUT) && defined(VEC_SIZE) + +#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE) +#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) +#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE) +#define CONVERT_RTE(x, type) (convert_##type##_rte((x))) +#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type) +#define SELECT_TYPE VEC_INT + +/** Returns result of prelu function implemented as below: + * f(input) = alpha * input for input < 0, f(input) = input for input >= 0. + * + * @attention Data type can be passed using the -DDATA_TYPE_IN compile flag, e.g. + * -DDATA_TYPE_IN=uchar + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 + * @note Can only take uchar data types. + * + * @param[in] input1_ptr Pointer to the source image. 
Supported Data + * types : QASYMM8 + * @param[in] input1_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input1_step_x input1_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input1_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input1_step_y input1_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input1_step_z input1_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[in] alpha_ptr Pointer to the source image. Supported Data + * types : QASYMM8 + * @param[in] alpha_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] alpha_step_x input2_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] alpha_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] alpha_step_y input2_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] alpha_step_z input2_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[out] output_ptr Pointer to the destination image. 
Supported + * data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image + */ +__kernel void prelu_qasymm8(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(alpha), + TENSOR3D_DECLARATION(output)) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VEC_INT in_vec = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)input.ptr), VEC_INT); + VEC_INT alpha_vec = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)alpha.ptr), VEC_INT); + + in_vec = SUB(in_vec, (VEC_INT)((int)OFF_IN)); + alpha_vec = SUB(alpha_vec, (VEC_INT)((int)OFF_ALPHA)); + + const VEC_FLOAT inf32 = CONVERT(in_vec, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN); + const VEC_FLOAT alphaf32 = CONVERT(alpha_vec, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_ALPHA); + const VEC_FLOAT outf32 = + select(inf32, inf32 * alphaf32, CONVERT(inf32 < (VEC_FLOAT)0, SELECT_TYPE)); + const VEC_FLOAT qresf32 = outf32 / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFF_OUT)); + const VEC_UCHAR res = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_UCHAR); + + VSTORE(VEC_SIZE) + (res, 0, (__global uchar *)output.ptr); +} + +#endif // defined(OFF_IN) && defined(OFF_ALPHA) && defined(OFF_OUT) && defined(SCALE_IN) && + // 
defined(SCALE_ALPHA) && defined(SCALE_OUT) && defined(VEC_SIZE) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl new file mode 100644 index 000000000..d7ea2e2c4 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016, 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) +/** Perform reduce max/min + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. + * -DDATA_TYPE=short + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. + * e.g. -DDEPTH_OUT=16 + * @attention Operation type(code) specifying which operation to perform should be passed as + * preprocessor argument using -DOP_CODE = number. e.g. -DOP_CODE=1 + * + * @param[in] input_ptr Pointer to the source image. 
Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[in] input_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] input_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[out] output_ptr Pointer to the destination image. 
Supported data + * types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image + * @param[in] axis Axis through which reduction occurs + * @param[in] dim Dimension across the axis to be reduced. 
+ */ +__kernel void reduce_min_max(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), + const int axis, const int dim) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + + int indices[4] = { + get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT, + get_global_id(2) / DEPTH_OUT, + }; + + DATA_TYPE value = + *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); + for (int i = 1; i < dim; ++i) + { + indices[axis] = i; + +#if OP_CODE == 1 // REDUCE_MAX + value = max(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], + indices[2], indices[3]))); + +#elif OP_CODE == 2 // REDUCE_MIN + value = min(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], + indices[2], indices[3]))); + +#else // OP NOT SUPPORTED + return; + +#endif + } + + *((__global DATA_TYPE *)out.ptr) = value; +} + +/** Perform reduce sum/mean + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. + * -DDATA_TYPE=short + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. + * e.g. -DDEPTH_OUT=16 + * @attention Operation type(code) specifying which operation to perform should be passed as + * preprocessor argument using -DOP_CODE = number. e.g. -DOP_CODE=1 + * + * @param[in] input_ptr Pointer to the source image. 
Supported data + * types: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[in] input_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] input_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[out] output_ptr Pointer to the destination image. 
Supported data + * types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image + * @param[in] axis Axis through which reduction occurs + * @param[in] dim Dimension across the axis to be reduced. 
+ */ +__kernel void reduce_sum_mean(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), + const int axis, const int dim) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + + int indices[4] = { + get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT, + get_global_id(2) / DEPTH_OUT, + }; + + DATA_TYPE sum_value = (DATA_TYPE)0; + for (int i = 0; i < dim; ++i) + { + indices[axis] = i; + sum_value += *( + (__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); + } + +#if OP_CODE == 3 // REDUCE_SUM + *((__global DATA_TYPE *)out.ptr) = sum_value; + +#elif OP_CODE == 4 // REDUCE_MEAN + *((__global DATA_TYPE *)out.ptr) = sum_value / CONVERT(dim, DATA_TYPE); + +#else // OP NOT SUPPORTED + return; + +#endif +} +#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl new file mode 100644 index 000000000..7367da7fb --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016, 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && \ + defined(WIDTH_IN) && defined(ZERO_VALUE) +/** Perform space to batch with input of 4D and NCHW format + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. + * e.g. -DDEPTH_OUT=16 + * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size. + * e.g. -DBATCH_IN=16 + * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size. + * e.g. -DHEIGHT_IN=16 + * @attention Input tensor width should be given as a preprocessor argument using -DHEIGHT_IN=size. + * e.g. -DWIDTH_IN=16 + * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0 + * + * @param[in] input_ptr Pointer to the source tensor. Supported + * data types: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source tensor in X + * dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along + * X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y + * dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along + * Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z + * dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along + * Z processed per workitem(in bytes) + * @param[in] input_stride_w Stride of the destination tensor in W + * dimension (in bytes) + * @param[in] input_step_w input_stride_w * number of elements along + * W processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the + * source tensor + * @param[out] output_ptr Pointer to the destination tensor. 
+ * Supported data types: same as @p + * input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X + * dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y + * dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z + * dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements + * along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the destination tensor in W + * dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements + * along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + * @param[in] block_size_ptr Pointer to the source tensor. Supported + * data types: S32 + * @param[in] block_size_stride_x Stride of the source tensor in X + * dimension (in bytes) + * @param[in] block_size_step_x block_size_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] block_size_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + * @param[in] padding_size_ptr Pointer to the source tensor. 
Supported + * data types: S32 + * @param[in] padding_size_stride_x Stride of the source tensor in X + * dimension (in bytes) + * @param[in] padding_size_step_x padding_size_stride_x * number of + * elements along X processed per workitem + * (in bytes) + * @param[in] padding_size_stride_y Stride of the source tensor in Y + * dimension (in bytes) + * @param[in] padding_size_step_y padding_size_stride_y * number of + * elements along Y processed per workitem + * (in bytes) + * @param[in] padding_size_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + */ +__kernel void space_to_batch_4d_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), + VECTOR_DECLARATION(block_size), + IMAGE_DECLARATION(padding_size)) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + + int block_size_x = *((__global int *)(block_size_ptr)); + int block_size_y = *((__global int *)(block_size_ptr + block_size_stride_x)); + int shift_x = (get_global_id(2) / DEPTH_OUT / BATCH_IN) % block_size_x; + int shift_y = (get_global_id(2) / DEPTH_OUT / BATCH_IN) / block_size_x; + + int in_index[4] = { + 0, + }; + in_index[0] = get_global_id(0) * block_size_x + shift_x - *((__global int *)(padding_size_ptr)); + in_index[1] = get_global_id(1) * block_size_y + shift_y - + *((__global int *)(padding_size_ptr + padding_size_stride_y)); + in_index[2] = get_global_id(2) % DEPTH_OUT; + in_index[3] = (get_global_id(2) / DEPTH_OUT) % BATCH_IN; + + if (in_index[0] < 0 || in_index[0] >= WIDTH_IN || in_index[1] < 0 || in_index[1] >= HEIGHT_IN) + { + *((__global DATA_TYPE *)out.ptr) = (DATA_TYPE)ZERO_VALUE; + } + else + { + *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset( + &in, in_index[0], in_index[1], in_index[2], in_index[3])); + } +} +#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && + // defined(WIDTH_IN) && 
defined(ZERO_VALUE) + +#if defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && \ + defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE) +/** Perform space to batch with input of 4D and NHWC format + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Output tensor depth should be given as a preprocessor argument using + * -DHEIGHT_OUT=size. e.g. -DHEIGHT_OUT=16 + * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size. + * e.g. -DBATCH_IN=16 + * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size. + * e.g. -DHEIGHT_IN=16 + * @attention Input tensor width should be given as a preprocessor argument using -DHEIGHT_IN=size. + * e.g. -DWIDTH_IN=16 + * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0 + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 + * + * @param[in] input_ptr Pointer to the source tensor. 
Supported + * data types: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source tensor in X + * dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along + * X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y + * dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along + * Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z + * dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along + * Z processed per workitem(in bytes) + * @param[in] input_stride_w Stride of the destination tensor in W + * dimension (in bytes) + * @param[in] input_step_w input_stride_w * number of elements along + * W processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the + * source tensor + * @param[out] output_ptr Pointer to the destination tensor. 
+ * Supported data types: same as @p + * input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X + * dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y + * dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z + * dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements + * along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the destination tensor in W + * dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements + * along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + * @param[in] block_size_ptr Pointer to the source tensor. Supported + * data types: S32 + * @param[in] block_size_stride_x Stride of the source tensor in X + * dimension (in bytes) + * @param[in] block_size_step_x block_size_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] block_size_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + * @param[in] padding_size_ptr Pointer to the source tensor. 
Supported + * data types: S32 + * @param[in] padding_size_stride_x Stride of the source tensor in X + * dimension (in bytes) + * @param[in] padding_size_step_x padding_size_stride_x * number of + * elements along X processed per workitem + * (in bytes) + * @param[in] padding_size_stride_y Stride of the source tensor in Y + * dimension (in bytes) + * @param[in] padding_size_step_y padding_size_stride_y * number of + * elements along Y processed per workitem + * (in bytes) + * @param[in] padding_size_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + */ +__kernel void space_to_batch_4d_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), + VECTOR_DECLARATION(block_size), + IMAGE_DECLARATION(padding_size)) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, HEIGHT_OUT); + + int block_size_x = *((__global int *)(block_size_ptr)); + int block_size_y = *((__global int *)(block_size_ptr + block_size_stride_x)); + int shift_x = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) % block_size_x; + int shift_y = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) / block_size_x; + + int in_index[4] = { + 0, + }; + in_index[0] = get_global_id(0) * VEC_SIZE; + in_index[1] = get_global_id(1) * block_size_x + shift_x - *((__global int *)(padding_size_ptr)); + in_index[2] = get_global_id(2) % HEIGHT_OUT * block_size_y + shift_y - + *((__global int *)(padding_size_ptr + padding_size_stride_y)); + in_index[3] = (get_global_id(2) / HEIGHT_OUT) % BATCH_IN; + + if (in_index[1] < 0 || in_index[1] >= WIDTH_IN || in_index[2] < 0 || in_index[2] >= HEIGHT_IN) + { + VSTORE(VEC_SIZE) + ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))ZERO_VALUE, 0, (__global DATA_TYPE *)out.ptr); + } + else + { + VSTORE(VEC_SIZE) + (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], + in_index[2], in_index[3])), + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), + 0, (__global DATA_TYPE 
*)out.ptr); + } +} + +#endif // defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && + // defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl new file mode 100644 index 000000000..a26e762e8 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016, 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN) +/** Perform space to depth rearrangement of tensor + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. + * e.g. -DDEPTH_IN=16 + * @attention The value of the z-axis of input tensor depth should be given as a preprocessor + * argument using -DZ_IN=size. e.g. -DZ_IN=16 + * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. + * -DBLOCK_SIZE=1 + * + * @param[in] input_ptr Pointer to the source image. 
Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[out] output_ptr Pointer to the destination image. Supported data + * types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image + */ +__kernel void space_to_depth_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN); + Tensor4D out = 
CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); + + int out_index[4] = {0}; + int in_index[4] = {0}; + + in_index[0] = get_global_id(0); // W + in_index[1] = get_global_id(1); // H + in_index[2] = get_global_id(2) % Z_IN; // C + in_index[3] = get_global_id(2) / Z_IN; // B + + out_index[0] = in_index[0] / BLOCK_SIZE; + out_index[1] = in_index[1] / BLOCK_SIZE; + out_index[2] = + in_index[2] + ((in_index[1] % BLOCK_SIZE) * BLOCK_SIZE + in_index[0] % BLOCK_SIZE) * DEPTH_IN; + out_index[3] = in_index[3]; + + *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2], + out_index[3])) = *((__global DATA_TYPE *)in.ptr); +} +#endif // defined(DATA_TYPE) && defined(Z_IN) && defined(BLOCK_SIZE) && defined(Z_IN) + +#if defined(DATA_TYPE) && defined(Z_IN) && defined(BLOCK_SIZE) && defined(Z_IN) +/** Perform space to depth rearrangement of tensor + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. + * e.g. -DDEPTH_IN=16 + * @attention The value of the z-axis of input tensor depth should be given as a preprocessor + * argument using -DZ_IN=size. e.g. -DZ_IN=16 + * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. + * -DBLOCK_SIZE=1 + * + * @param[in] input_ptr Pointer to the source image. 
Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[out] output_ptr Pointer to the destination image. Supported data + * types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image + */ +__kernel void space_to_depth_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN); + Tensor4D out = 
CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); + + int out_index[4] = {0}; + int in_index[4] = {0}; + + in_index[0] = get_global_id(0); // C + in_index[1] = get_global_id(1); // W + in_index[2] = get_global_id(2) % Z_IN; // H + in_index[3] = get_global_id(2) / Z_IN; // B + + out_index[0] = + in_index[0] + ((in_index[2] % BLOCK_SIZE) * BLOCK_SIZE + in_index[1] % BLOCK_SIZE) * DEPTH_IN; + out_index[1] = in_index[1] / BLOCK_SIZE; + out_index[2] = in_index[2] / BLOCK_SIZE; + out_index[3] = in_index[3]; + + *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2], + out_index[3])) = *((__global DATA_TYPE *)in.ptr); +} +#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl new file mode 100644 index 000000000..50472e4f9 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "helpers.h"
+
+// Scatter the strided input vector into flat working buffers:
+// keys = the float values, indices = the original positions 0..n-1.
+// Work is distributed grid-wide; n is assumed to be a multiple of the
+// global work size (iter = n / gws has no remainder handling).
+__kernel void topkv2_init(VECTOR_DECLARATION(input), __global float *in_key_buf,
+                          __global int *in_ind_buf, const int n)
+{
+  int gid = get_global_id(0);
+  int lws = get_local_size(0);
+  int groups = get_num_groups(0);
+  int gws = lws * groups;
+  int iter = n / gws;
+
+  Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input);
+
+  for (int i = 0; i < iter; ++i)
+  {
+    int idx = i * gws + gid;
+    in_key_buf[idx] = *(__global float *)(input.ptr + idx * input.stride_x);
+    in_ind_buf[idx] = idx;
+  }
+}
+
+// Locate the index of the first negative key in the sorted key buffer.
+// At most one work item satisfies each branch, so the single write to
+// *first_negative_idx is race-free.
+// NOTE(review): keys equal to 0.f match neither `> 0.f` nor `< 0.f`; this
+// presumably relies on the preceding sort's positive/negative split --
+// confirm against the host-side sort passes.
+__kernel void topkv2_find_first_negative(__global float *out_key_buf,
+                                         __global int *first_negative_idx, int n)
+{
+  int gid = get_global_id(0);
+
+  if (gid == n - 1)
+  {
+    // if the last item is positive, the first negative index is n.
+    if (out_key_buf[gid] > 0.f)
+      *first_negative_idx = n;
+  }
+  else if (gid == 0)
+  {
+    // if the first item is negative, set it 0.
+    if (out_key_buf[gid] < 0.f)
+      *first_negative_idx = 0;
+  }
+  else
+  {
+    // if its left is positive and it is negative, then it is the first negative item.
+    if (out_key_buf[gid - 1] > 0.f && out_key_buf[gid] < 0.f)
+      *first_negative_idx = gid;
+  }
+}
+
+// Move the trailing negative block in front (read back-to-front) and shift
+// the remaining items after it.
+// FIX: in_ind_buf/out_ind_buf hold int indices -- topkv2_init writes them as
+// int and topkv2_store reads them as int -- but they were declared
+// __global float *, so indices were copied through float loads/stores.
+// Declare them __global int * consistently (host code passes cl_mem buffer
+// objects, so no host-side change is needed).
+__kernel void topkv2_reorder_negatives(__global float *in_key_buf, __global float *out_key_buf,
+                                       __global int *in_ind_buf, __global int *out_ind_buf,
+                                       __global int *first_negative_idx, int n)
+{
+  int gid = get_global_id(0);
+
+  int num_negs = n - *first_negative_idx;
+  int in_idx;
+
+  if (gid < num_negs)
+  {
+    // negatives are read from the end of the buffer, reversed
+    in_idx = n - 1 - gid;
+  }
+  else
+  {
+    in_idx = gid - num_negs;
+  }
+
+  out_key_buf[gid] = in_key_buf[in_idx];
+  out_ind_buf[gid] = in_ind_buf[in_idx];
+}
+
+// Copy the results into the output tensors, reading the working buffers
+// back-to-front (idx = n - 1 - gid).
+__kernel void topkv2_store(VECTOR_DECLARATION(values), VECTOR_DECLARATION(indices),
+                           __global float *out_key_buf, __global int *out_ind_buf, int n)
+{
+  int gid = get_global_id(0);
+
+  Vector values = CONVERT_TO_VECTOR_STRUCT_NO_STEP(values);
+  Vector indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(indices);
+
+  int idx = n - 1 - gid;
+
+  *(__global float *)(values.ptr + gid * values.stride_x) = out_key_buf[idx];
+  *(__global int *)(indices.ptr + gid * indices.stride_x) = out_ind_buf[idx];
+}
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl
new file mode 100644
index 000000000..9594daf19
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "helpers.h"
+
+// Address of the idx-th float element of a strided ACL Vector.
+__global inline float *get_vec_elem(Vector *vec, int idx)
+{
+  return (__global float *)(vec->ptr + idx * vec->stride_x);
+}
+
+// Address of the idx-th int element of a strided ACL Vector.
+__global inline int *get_vec_elem_int(Vector *vec, int idx)
+{
+  return (__global int *)(vec->ptr + idx * vec->stride_x);
+}
+
+// A utility function to swap two elements
+void swap(__global float *a, __global float *b)
+{
+  float t = *a;
+  *a = *b;
+  *b = t;
+}
+
+void swap_idx(__global int *a, __global int *b)
+{
+  int t = *a;
+  *a = *b;
+  *b = t;
+}
+
+/* Lomuto partition around the last element; `>=` makes the order descending.
+   This function is the same in both the iterative and recursive versions. */
+int partition(Vector *arr, __global int *indices, int l, int h)
+{
+  float x = *get_vec_elem(arr, h);
+  int i = (l - 1);
+
+  for (int j = l; j <= h - 1; j++)
+  {
+    if (*get_vec_elem(arr, j) >= x)
+    {
+      i++;
+      swap(get_vec_elem(arr, i), get_vec_elem(arr, j));
+      swap_idx(&indices[i], &indices[j]);
+    }
+  }
+  swap(get_vec_elem(arr, i + 1), get_vec_elem(arr, h));
+  swap_idx(&indices[i + 1], &indices[h]);
+  return (i + 1);
+}
+
+/* A[] --> Array to be sorted,
+   l --> Starting index,
+   h --> Ending index
+   Iterative quicksort using the caller-provided `stack` as the explicit
+   recursion stack; `indices` is permuted in lock-step with the keys. */
+void quickSortIterative(Vector *arr, __global int *indices, __global int *stack, int l, int h)
+{
+  // Create an auxiliary stack
+
+  // initialize top of stack
+  int top = -1;
+
+  // push initial values of l and h to stack
+  stack[++top] = l;
+  stack[++top] = h;
+
+  // Keep popping from stack while it is not empty
+  while (top >= 0)
+  {
+    // Pop h and l
+    h = stack[top--];
+    l = stack[top--];
+
+    // Set pivot element at its correct position
+    // in sorted array
+    int p = partition(arr, indices, l, h);
+
+    // If there are elements on left side of pivot,
+    // then push left side to stack
+    if (p - 1 > l)
+    {
+      stack[++top] = l;
+      stack[++top] = p - 1;
+    }
+
+    // If there are elements on right side of pivot,
+    // then push right side to stack
+    if (p + 1 < h)
+    {
+      stack[++top] = p + 1;
+      stack[++top] = h;
+    }
+  }
+}
+
+// Single-work-item kernel: sort all n inputs descending, then copy out the
+// first k values and their original indices.
+__kernel void topkv2_quicksort(VECTOR_DECLARATION(input), VECTOR_DECLARATION(topk_values),
+                               VECTOR_DECLARATION(topk_indices), __global int *indices,
+                               __global int *temp_stack, int k, int n)
+{
+  Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input);
+  Vector topk_values = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_values);
+  Vector topk_indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_indices);
+
+  for (int i = 0; i < n; ++i)
+  {
+    indices[i] = i;
+  }
+
+  quickSortIterative(&input, indices, temp_stack, 0, n - 1);
+
+  // extract k items.
+  for (int i = 0; i < k; ++i)
+  {
+    *get_vec_elem(&topk_values, i) = *get_vec_elem(&input, i);
+    *get_vec_elem_int(&topk_indices, i) = indices[i];
+  }
+}
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
new file mode 100644
index 000000000..f6830d229
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+
+// reference:
+// https://code.google.com/archive/p/ocl-radix-sort/source/default/source
+// OpenCL kernel sources for the CLRadixSort class
+// the #include does not exist in OpenCL
+// Copyright Philippe Helluy, Université de Strasbourg, France, 2011, helluy@math.unistra.fr
+// licensed under the GNU Lesser General Public License see http://www.gnu.org/copyleft/lesser.html
+// if you find this software useful you can cite the following work in your reports or articles:
+// Philippe HELLUY, A portable implementation of the radix sort algorithm in OpenCL, 2011.
+// http://hal.archives-ouvertes.fr/hal-00596730
+
+// Reference for floating point radix sort:
+// http://www.codercorner.com/RadixSortRevisited.htm
+
+// compute the histogram for each radix and each virtual processor for the pass
+// Keys are float values reinterpreted as raw unsigned bits; _RADIX/_BITS are
+// build-time constants. n is assumed divisible by (groups * items): the
+// sub-list size uses integer division with no remainder handling.
+__kernel void radixsort_histogram(__global float *in_key_buf, __global int *d_Histograms,
+                                  const int pass, __local int *loc_histo, const int n)
+{
+  int it = get_local_id(0);  // i local number of the processor
+  int ig = get_global_id(0); // global number = i + g I
+
+  int gr = get_group_id(0); // g group number
+
+  int groups = get_num_groups(0);
+  int items = get_local_size(0);
+
+  // set the local histograms to zero
+  for (int ir = 0; ir < _RADIX; ir++)
+  {
+    loc_histo[ir * items + it] = 0;
+  }
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  // range of keys that are analyzed by the work item
+  int size = n / groups / items; // size of the sub-list
+  int start = ig * size;         // beginning of the sub-list
+
+  unsigned int key;
+  int shortkey, k;
+
+  // compute the index
+  // the computation depends on the transposition
+  for (int j = 0; j < size; j++)
+  {
+#ifdef TRANSPOSE
+    k = groups * items * j + ig;
+#else
+    k = j + start;
+#endif
+
+    // reinterpret the float key as its raw bit pattern
+    key = *((__global unsigned int *)(in_key_buf + k));
+
+    // extract the group of _BITS bits of the pass
+    // the result is in the range 0.._RADIX-1
+    shortkey = ((key >> (pass * _BITS)) & (_RADIX - 1));
+
+    // increment the local histogram
+    loc_histo[shortkey * items + it]++;
+  }
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  // copy the local histogram to the global one
+  for (int ir = 0; ir < _RADIX; ir++)
+  {
+    d_Histograms[items * (ir * groups + gr) + it] = loc_histo[ir * items + it];
+  }
+
+  barrier(CLK_GLOBAL_MEM_FENCE);
+}
+
+// initial transpose of the list for improving
+// coalescent memory access
+// Tiles of tilesize x tilesize are staged through local memory; when PERMUT
+// is defined the permutation array is transposed in lock-step.
+__kernel void transpose(const __global int *invect, __global int *outvect, const int nbcol,
+                        const int nbrow, const __global int *inperm, __global int *outperm,
+                        __local int *blockmat, __local int *blockperm, const int tilesize)
+{
+
+  int i0 = get_global_id(0) * tilesize; // first row index
+  int j = get_global_id(1);             // column index
+
+  int jloc = get_local_id(1); // local column index
+
+  // fill the cache
+  for (int iloc = 0; iloc < tilesize; iloc++)
+  {
+    int k = (i0 + iloc) * nbcol + j; // position in the matrix
+    blockmat[iloc * tilesize + jloc] = invect[k];
+#ifdef PERMUT
+    blockperm[iloc * tilesize + jloc] = inperm[k];
+#endif
+  }
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  // first row index in the transpose
+  int j0 = get_group_id(1) * tilesize;
+
+  // put the cache at the good place
+  for (int iloc = 0; iloc < tilesize; iloc++)
+  {
+    int kt = (j0 + iloc) * nbrow + i0 + jloc; // position in the transpose
+    outvect[kt] = blockmat[jloc * tilesize + iloc];
+#ifdef PERMUT
+    outperm[kt] = blockperm[jloc * tilesize + iloc];
+#endif
+  }
+}
+
+// each virtual processor reorders its data using the scanned histogram
+// loc_histo is preloaded with the scanned (prefix-summed) histogram, so
+// loc_histo[shortkey * items + it] is the next output slot for that radix.
+__kernel void radixsort_reorder(__global float *in_key, __global float *out_key,
+                                __global int *d_Histograms, const int pass,
+                                __global int *indices_in, __global int *indices_out,
+                                __local int *loc_histo, const int n)
+{
+
+  int it = get_local_id(0);
+  int ig = get_global_id(0);
+
+  int gr = get_group_id(0);
+  int groups = get_num_groups(0);
+  int items = get_local_size(0);
+
+  int start = ig * (n / groups / items);
+  int size = n / groups / items;
+
+  // take the histogram in the cache
+  for (int ir = 0; ir < _RADIX; ir++)
+  {
+    loc_histo[ir * items + it] = d_Histograms[items * (ir * groups + gr) + it];
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  int newpos, shortkey, k, newpost;
+  unsigned int key;
+
+  for (int j = 0; j < size; j++)
+  {
+#ifdef TRANSPOSE
+    k = groups * items * j + ig;
+#else
+    k = j + start;
+#endif
+    float org_value = in_key[k];
+    key = *(__global unsigned int *)(in_key + k);
+    shortkey = ((key >> (pass * _BITS)) & (_RADIX - 1));
+
+    newpos = loc_histo[shortkey * items + it];
+
+#ifdef TRANSPOSE
+    int ignew, jnew;
+    ignew = newpos / (n / groups / items);
+    jnew = newpos % (n / groups / items);
+    newpost = jnew * (groups * items) + ignew;
+#else
+    newpost = newpos;
+#endif
+
+    // d_outKeys[newpost]= key; // killing line !!!
+    out_key[newpost] = org_value;
+
+#ifdef PERMUT
+    indices_out[newpost] = indices_in[k];
+#endif
+
+    newpos++;
+    loc_histo[shortkey * items + it] = newpos;
+  }
+}
+
+// perform a parallel prefix sum (a scan) on the local histograms
+// (see Blelloch 1990) each workitem worries about two memories
+// see also http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html
+// Produces an exclusive scan per work-group; per-group totals go to globsum
+// for the follow-up radixsort_pastehistograms pass.
+__kernel void radixsort_scanhistograms(__global int *histo, __local int *temp,
+                                       __global int *globsum)
+{
+  int it = get_local_id(0);
+  int ig = get_global_id(0);
+  int decale = 1;
+  int n = get_local_size(0) * 2;
+  int gr = get_group_id(0);
+
+  // load input into local memory
+  // up sweep phase
+  temp[2 * it] = histo[2 * ig];
+  temp[2 * it + 1] = histo[2 * ig + 1];
+
+  // parallel prefix sum (algorithm of Blelloch 1990)
+  for (int d = n >> 1; d > 0; d >>= 1)
+  {
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (it < d)
+    {
+      int ai = decale * (2 * it + 1) - 1;
+      int bi = decale * (2 * it + 2) - 1;
+      temp[bi] += temp[ai];
+    }
+    decale *= 2;
+  }
+
+  // store the last element in the global sum vector
+  // (maybe used in the next step for constructing the global scan)
+  // clear the last element
+  if (it == 0)
+  {
+    globsum[gr] = temp[n - 1];
+    temp[n - 1] = 0;
+  }
+
+  // down sweep phase
+  for (int d = 1; d < n; d *= 2)
+  {
+    decale >>= 1;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (it < d)
+    {
+      int ai = decale * (2 * it + 1) - 1;
+      int bi = decale * (2 * it + 2) - 1;
+
+      int t = temp[ai];
+      temp[ai] = temp[bi];
+      temp[bi] += t;
+    }
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  // write results to device memory
+
+  histo[2 * ig] = temp[2 * it];
+  histo[2 * ig + 1] = temp[2 * it + 1];
+
+  barrier(CLK_GLOBAL_MEM_FENCE);
+}
+
+// use the global sum for updating the local histograms
+// each work item updates two values
+__kernel void radixsort_pastehistograms(__global int *histo, __global int *globsum)
+{
+  int ig = get_global_id(0);
+  int gr = get_group_id(0);
+
+  int s;
+
+  s = globsum[gr];
+
+  // write results to device memory
+  histo[2 * ig] += s;
+  histo[2 * ig + 1] += s;
+
+  barrier(CLK_GLOBAL_MEM_FENCE);
+}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp
new file mode 100644
index 000000000..7f4b5b0df
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+// Shape of the arg-op result: same rank as input with `axis` reduced to 1.
+const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis)
+{
+  TensorShape out_shape{input_shape};
+
+  out_shape.set(axis, 1);
+
+  return out_shape;
+}
+} // namespace
+
+namespace
+{
+// NOTE(review): the first two checks use ERROR_ON (abort/throw style) macros
+// inside a Status-returning function, unlike the RETURN_ERROR_ON checks
+// below -- presumably intentional, but worth confirming for release builds.
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis,
+                          ArgOperation /*op*/)
+{
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::S32, DataType::F32, DataType::U8,
+                                        DataType::QASYMM8);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::S32);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape().num_dimensions() - 1) !=
+                                    output->tensor_shape().num_dimensions(),
+                                  "Input's rank is not same with output");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
+                                  "Inputs are not broadcast compatible");
+
+  const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(),
+                                  "output shape's size does not match axis");
+
+  const auto num_dimensions = input->tensor_shape().num_dimensions();
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= num_dimensions, "axis must be less than (input's rank).");
+  return Status{};
+}
+
+} // namespace
+
+CLArgOperationKernel::CLArgOperationKernel() : _input(nullptr), _output(nullptr), _axis() {}
+
+// Build the "arg_op" OpenCL kernel (op selected via -DOP_CODE) and configure
+// the execution window over the reduced output shape.
+void CLArgOperationKernel::configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis,
+                                     ArgOperation op)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
+
+  _input = input;
+  _output = output;
+  _axis = axis;
+
+  std::unique_ptr<ITensorInfo> output_info = output->info()->clone();
+  output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis));
+
+  // Construct kernel and set op_code based on type of ArgOperation as specified by object op
+  std::string kernel_name = "arg_op";
+  int op_code = 0;
+  if (op == ArgOperation::MAX)
+  {
+    op_code = 1;
+  }
+  else if (op == ArgOperation::MIN)
+  {
+    op_code = 2;
+  }
+  else
+    throw std::runtime_error("Operation not supported, yet");
+
+  // Set kernel build options
+  std::set<std::string> build_opts;
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2)));
+  build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code));
+
+  // Create kernel
+  _kernel =
+    static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+  // Configure kernel window
+  Window win = calculate_max_window(*output_info, Steps());
+
+  Coordinates coord;
+  coord.set_num_dimensions(output_info->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape()));
+
+  ICLKernel::configure_internal(win);
+}
+
+Status CLArgOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                      const uint32_t axis, ArgOperation op)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
+
+  return Status{};
+}
+
+// Enqueue the kernel over collapsed 4D slices. The output's TensorInfo shape
+// is temporarily replaced with the reduced shape for argument setup and
+// restored before returning.
+void CLArgOperationKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  const TensorShape &shape_in = _input->info()->tensor_shape();
+
+  unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
+
+  _kernel.setArg<cl_int>(idx++, _axis);
+  _kernel.setArg<cl_int>(idx++, shape_in[_axis]);
+
+  Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  // Setup input slice
+  Window slice_in(slice_out);
+  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+  slice_in.set(3, Window::Dimension(0, 0, 0));
+
+  // Copy output's shape in order to use for recovering at end of this method
+  const TensorShape shape_out = _output->info()->tensor_shape();
+  _output->info()->set_tensor_shape(inferOutputShape(shape_in, _axis));
+
+  do
+  {
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, slice_in);
+    add_4D_tensor_argument(idx, _output, slice_out);
+    enqueue(queue, *this, slice_out);
+  } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
+
+  // Recover output's shape of output tensor
+  _output->info()->set_tensor_shape(shape_out);
+}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
new file mode 100644
index 000000000..c14e73634
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+// Check types (U8/QASYMM8) and broadcast compatibility of the two inputs.
+Status validate_parameters(const ITensorInfo *input1, const ITensorInfo *input2,
+                           const ITensorInfo *output)
+{
+  const TensorShape &out_shape =
+    TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
+                                  "Inputs are not broadcast compatible");
+  // Validate in case of configured output
+  if (output->total_size() > 0)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8,
+                                                         DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
+      "Wrong shape for output");
+  }
+  return Status{};
+}
+} // namespace
+
+CLBinaryLogicalOpKernel::CLBinaryLogicalOpKernel()
+  : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+// Build the "binary_logical_op" kernel (AND/OR picked via -DOP_CODE) and set
+// up a broadcast-aware execution window over both inputs.
+void CLBinaryLogicalOpKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
+                                        ICLTensor *output, BinaryLogicalOperation op)
+{
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_parameters(input1->info(), input2->info(), output->info()));
+
+  _input1 = input1;
+  _input2 = input2;
+  _output = output;
+
+  // Create kernel
+  std::string kernel_name = "binary_logical_op";
+  std::set<std::string> build_opts;
+  build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())));
+
+  int op_code = 0;
+  switch (op)
+  {
+    case BinaryLogicalOperation::AND:
+      op_code = 1;
+      break;
+    case BinaryLogicalOperation::OR:
+      op_code = 2;
+      break;
+    default:
+      throw std::runtime_error("Operation not supported, yet");
+  }
+
+  build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code)));
+  build_opts.emplace(
+    ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+
+  _kernel =
+    static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+  const std::pair<TensorShape, ValidRegion> broadcast_pair =
+    ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
+
+  const ValidRegion &valid_region = broadcast_pair.second;
+
+  Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+  Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info());
+  Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info());
+
+  AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+  // NOTE(review): operator|| short-circuits, so the second/third
+  // update_window_and_padding calls are skipped once one returns true --
+  // looks intentional (same idiom as upstream ACL broadcast kernels), but
+  // confirm all windows/paddings end up updated.
+  update_window_and_padding(win_input1, input1_access) ||
+    update_window_and_padding(win_input2, input2_access) ||
+    update_window_and_padding(win, output_access);
+
+  output_access.set_valid_region(win, valid_region);
+
+  ICLKernel::configure_internal(win);
+}
+
+// Enqueue over collapsed 3D slices; per-input slices handle broadcasting of
+// dimensions of size 1.
+void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  const TensorShape &in_shape1 = _input1->info()->tensor_shape();
+  const TensorShape &in_shape2 = _input2->info()->tensor_shape();
+  const TensorShape &out_shape = _output->info()->tensor_shape();
+
+  bool can_collapse = true;
+  if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+  {
+    can_collapse =
+      (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+    for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+    {
+      can_collapse = (in_shape1[d] == in_shape2[d]);
+    }
+  }
+
+  bool has_collapsed = false;
+  Window collapsed =
+    can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
+                 : window;
+
+  const TensorShape &in_shape1_collapsed =
+    has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+  const TensorShape &in_shape2_collapsed =
+    has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+  Window slice = collapsed.first_slice_window_3D();
+  Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+  Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+  do
+  {
+    unsigned int idx = 0;
+    add_3D_tensor_argument(idx, _input1, slice_input1);
+    add_3D_tensor_argument(idx, _input2, slice_input2);
+    add_3D_tensor_argument(idx, _output, slice);
+
+    enqueue(queue, *this, slice);
+
+    collapsed.slide_window_slice_3D(slice_input1);
+    collapsed.slide_window_slice_3D(slice_input2);
+  } while (collapsed.slide_window_slice_3D(slice));
+}
+
+// Right-hand border required to replicate the smaller input up to the
+// vectorized processing width.
+BorderSize CLBinaryLogicalOpKernel::border_size() const
+{
+  const unsigned int replicateSize =
+    _output->info()->dimension(0) -
+    std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+  const unsigned int border =
+    std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
+  return BorderSize(0, border, 0, 0);
+}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
new file mode 100644
index 000000000..35f607bd0
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLCastKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+CLCastKernel::CLCastKernel() : _input(nullptr), _output(nullptr) {}
+
+// Select one of three cast kernels: "cast_qasymm_in" (dequantize input),
+// "cast_qasymm_out" (quantize output), or plain "cast". Shapes must match;
+// input_subtype == BOOL adds -DBOOL_INPUT for the plain variant.
+void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output, SubDataType input_subtype)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+                                                DataType::S16, DataType::S32, DataType::F16,
+                                                DataType::F32);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+                                                DataType::S16, DataType::S32, DataType::F16,
+                                                DataType::F32);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+  _input = input;
+  _output = output;
+
+  constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+  // Set kernel build options
+  CLBuildOptions build_opts;
+  build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.add_option("-DDATA_TYPE_OUT=" +
+                        get_cl_type_from_data_type(output->info()->data_type()));
+  build_opts.add_option(
+    ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+
+  // Create kernel
+  if (is_data_type_quantized_asymmetric(input->info()->data_type()))
+  {
+    // quantized -> other: bake the input's scale/offset into the kernel
+    const float scale_in = input->info()->quantization_info().scale;
+    const int offset_in = input->info()->quantization_info().offset;
+    build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_in));
+    build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_in));
+
+    _kernel = static_cast<cl::Kernel>(
+      CLKernelLibraryEx::get().create_kernel("cast_qasymm_in", build_opts.options()));
+  }
+  else if (is_data_type_quantized_asymmetric(output->info()->data_type()))
+  {
+    // other -> quantized: bake the output's scale/offset into the kernel
+    const float scale_in = output->info()->quantization_info().scale;
+    const int offset_in = output->info()->quantization_info().offset;
+    build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_in));
+    build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_in));
+
+    _kernel = static_cast<cl::Kernel>(
+      CLKernelLibraryEx::get().create_kernel("cast_qasymm_out", build_opts.options()));
+  }
+  else
+  {
+    build_opts.add_option_if(input_subtype == SubDataType::BOOL, "-DBOOL_INPUT");
+    _kernel = static_cast<cl::Kernel>(
+      CLKernelLibraryEx::get().create_kernel("cast", build_opts.options()));
+  }
+
+  // Configure kernel window
+  Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+  AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+  update_window_and_padding(win, input_access, output_access);
+  output_access.set_valid_region(win, input->info()->valid_region());
+
+  ICLKernel::configure_internal(win);
+}
+
+// Enqueue the cast kernel over collapsed 3D slices.
+void CLCastKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+  Window slice = collapsed.first_slice_window_3D();
+
+  do
+  {
+    unsigned int idx = 0;
+    add_3D_tensor_argument(idx, _input, slice);
+    add_3D_tensor_argument(idx, _output, slice);
+    enqueue(queue, *this, slice, lws_hint());
+  } while (collapsed.slide_window_slice_3D(slice));
+}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
new file mode 100644
index 000000000..2a3433c2b
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+// TODO Use this validation function
+#if 0
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          const int32_t block_size)
+{
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+                                                DataType::S16, DataType::S32, DataType::F16,
+                                                DataType::F32);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+                                                DataType::S16, DataType::S32, DataType::F16,
+                                                DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1,
+                                  "Block size should be greater than or equal to 1.");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0) * block_size,
+                                  "Output width should be equal to (Input width * block size)");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(1) != input->dimension(1) * block_size,
+                                  "Output height should be equal to (Input height * block size)");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) % (block_size * block_size) != 0,
+                                  "Input depth should be divisible by (block size * block size)");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+    output->dimension(2) != input->dimension(2) / (block_size * block_size),
+    "Output depth should be equal to (Input depth / (block size * block size))");
+
+  return Status{};
+}
+#endif
+} // namespace
+
+CLDepthToSpaceKernel::CLDepthToSpaceKernel() : _input(nullptr), _output(nullptr)
+{
+  // DO NOTHING
+}
+
+// Build the layout-specific "depth_to_space_<layout>" kernel; the output's
+// data layout selects the kernel variant and the -DDEPTH_OUT constant.
+void CLDepthToSpaceKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                     const int32_t block_size)
+{
+  // TODO Add validation of data_layout
+  _input = input;
+  _output = output;
+
+  // Set kernel build options
+  auto layout_out = output->info()->data_layout();
+  std::set<std::string> build_opts;
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
+  auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL);
+  auto depth = output->info()->dimension(index_depth);
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(depth));
+  build_opts.emplace("-DZ_OUT=" + support::cpp11::to_string(output->info()->tensor_shape().z()));
+
+  // Create kernel
+  _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(
+    "depth_to_space_" + lower_string(string_from_data_layout(layout_out)), build_opts));
+
+  // Configure kernel window
+  Window win = calculate_max_window(*output->info(), Steps());
+
+  Coordinates coord;
+  coord.set_num_dimensions(output->info()->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+  ICLKernel::configure_internal(win);
+}
+
+// Enqueue over collapsed 4D output slices; the input slice is left unset so
+// the kernel indexes the input itself.
+void CLDepthToSpaceKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+  Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  // Setup input slice
+  Window slice_in(slice_out);
+  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+  slice_in.set(3, Window::Dimension(0, 0, 0));
+
+  do
+  {
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, slice_in);
+    add_4D_tensor_argument(idx, _output, slice_out);
+    enqueue(queue, *this, slice_out);
+  } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
+}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
new file mode
100644 index 000000000..0862b78bf --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, input_access, output_access); + input_access.set_valid_region(win, output->valid_region()); + + Status err = (window_changed) + ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + return std::make_pair(err, win); +} +} // namespace + +CLEmbeddingLookupKernel::CLEmbeddingLookupKernel() + : _input(nullptr), _output(nullptr), _lookups(nullptr) +{ +} + +Status CLEmbeddingLookupKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *lookups) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1); + + return Status{}; +} + +void CLEmbeddingLookupKernel::configure(const ICLTensor *input, ICLTensor *output, + const ICLTensor *lookups) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info())); + + _input = input; + _output = output; + _lookups = lookups; + + // Set kernel build options + std::stringstream kernel_name; + std::set<std::string> build_opts; + kernel_name << "embedding_lookup"; + + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions())); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts)); + + // Configure kernel window + auto win_config = validate_and_configure_window(input->info(), output->info()); + 
ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); +} + +void CLEmbeddingLookupKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + Window win_lookup; + win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_in); + add_1D_tensor_argument(idx, _lookups, win_lookup); + + enqueue(queue, *this, slice_in); + } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_1D(win_lookup)); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp new file mode 100644 index 000000000..718f615f9 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLGatherExKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" +#include "arm_compute/core/UtilsEx.h" + +using namespace arm_compute; + +namespace +{ + +inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis) +{ + const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions())); + ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 3); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > 4); + ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= input->num_dimensions()); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); + TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex( + input->tensor_shape(), indices->tensor_shape(), actual_axis); + ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); + } + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32); + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *indices, + ITensorInfo *output, int axis) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices); + const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions())); + std::unique_ptr<ITensorInfo> 
output_info = input->clone(); + output_info->set_tensor_shape(arm_compute::misc::shape_calculator::compute_gather_shape_ex( + input->tensor_shape(), indices->tensor_shape(), actual_axis)); + // Output auto initialization if not yet initialized + auto_init_if_empty((*output), output_info->tensor_shape(), 1, input->data_type()); + + // Create window + Window win = calculate_max_window(*output, Steps()); + output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); + + return std::make_pair(Status{}, win); +} + +} // namespace + +CLGatherExKernel::CLGatherExKernel() + : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0) +{ +} + +void CLGatherExKernel::configure(const ICLTensor *input, const ICLTensor *indices, + ICLTensor *output, int axis) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), indices->info(), output->info(), axis)); + + // Configure kernel window + auto win_config = + validate_and_configure_window(input->info(), indices->info(), output->info(), axis); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + + _input = input; + _output = output; + _indices = indices; + _axis = wrap_around(axis, static_cast<int>(input->info()->num_dimensions())); + + // Set build options + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.add_option("-DOUTPUT_DIM_Z=" + + support::cpp11::to_string(output->info()->dimension(2))); + build_opts.add_option("-DINPUT_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2))); + build_opts.add_option("-DAXIS=" + support::cpp11::to_string(_axis)); + build_opts.add_option("-DINDICES_DIM=" + + support::cpp11::to_string(indices->info()->num_dimensions())); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("gather_ex", build_opts.options())); + ICLKernel::configure_internal(win_config.second); +} + 
+Status CLGatherExKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, indices, output, axis)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), + indices->clone().get(), + output->clone().get(), axis) + .first); + return Status{}; +} + +void CLGatherExKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ, 4); + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, window_collapsed); + add_3D_tensor_argument(idx, _indices, window_collapsed); + add_4D_tensor_argument(idx, _output, window_collapsed); + enqueue(queue, *this, window_collapsed, lws_hint()); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp new file mode 100644 index 000000000..31e98c9a8 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, input_access, output_access); + input_access.set_valid_region(win, output->valid_region()); + + Status err = (window_changed) + ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + return std::make_pair(err, win); +} +} // namespace + +CLHashtableLookupKernel::CLHashtableLookupKernel() +{ + // DO NOTHING +} + +Status CLHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITensorInfo *keys, + const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *hits) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0, + "Output's shape was not set"); + + 
ARM_COMPUTE_ERROR_ON(lookups->dimension(0) != hits->dimension(0) || + output->dimension(output->num_dimensions() - 1) != lookups->dimension(0)); + ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1); + ARM_COMPUTE_ERROR_ON(keys->num_dimensions() > 1); + ARM_COMPUTE_ERROR_ON(hits->num_dimensions() > 1); + + return Status{}; +} + +void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTensor *keys, + const ICLTensor *input, ICLTensor *output, ICLTensor *hits) +{ + ARM_COMPUTE_ERROR_THROW_ON( + validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info())); + + _lookups = lookups; + _keys = keys; + _input = input; + _output = output; + _hits = hits; + + // Make _lookup_indices tensor + _lookup_indices = arm_compute::support::cpp14::make_unique<CLTensor>(); + _lookup_indices->allocator()->init( + TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32)); + _lookup_indices->allocator()->allocate(); + + // Set kernel build options + std::stringstream kernel_name; + std::set<std::string> build_opts; + kernel_name << "hashtable_lookup"; + + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions())); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts)); + + // Configure kernel window + auto win_config = validate_and_configure_window(input->info(), output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); +} + +void CLHashtableLookupKernel::run(const Window 
&window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const_cast<ICLTensor *>(_lookups)->map(queue); + const_cast<ICLTensor *>(_keys)->map(queue); + _hits->map(queue); + _lookup_indices->map(queue); + + // Set values of hits + const int32_t *lookups_buf = + reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_lookups)->buffer()); + const int32_t *keys_buf = reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_keys)->buffer()); + uint8_t *hits_buf = reinterpret_cast<uint8_t *>(_hits->buffer()); + int32_t *lookup_indices_buf = reinterpret_cast<int32_t *>(_lookup_indices->buffer()); + + std::map<int32_t, size_t> key_map; + const size_t keys_num = _keys->info()->dimension(0); + for (size_t key_index = 0; key_index < keys_num; key_index++) + { + key_map[keys_buf[key_index]] = key_index; + } + + const size_t lookups_num = _lookups->info()->dimension(0); + for (size_t i = 0; i < lookups_num; ++i) + { + const auto lookup_value = lookups_buf[i]; + const auto it = key_map.find(lookup_value); + if (it != key_map.end()) + { +#if defined(ARM_COMPUTE_DEBUG_ENABLED) + if (it->second >= lookups_num) + ARM_COMPUTE_ERROR("HashTable Lookup: index out of bounds."); +#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) + lookup_indices_buf[i] = static_cast<int32_t>(it->second); + hits_buf[i] = static_cast<uint8_t>(1); + } + else + { + lookup_indices_buf[i] = -1; + hits_buf[i] = static_cast<uint8_t>(0); + } + } + + const_cast<ICLTensor *>(_lookups)->unmap(queue); + const_cast<ICLTensor *>(_keys)->unmap(queue); + _hits->unmap(queue); + _lookup_indices->unmap(queue); + + Window win = window.collapse(ICLKernel::window(), 2, 4); + + Window win_lookup; + win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, win); + add_4D_tensor_argument(idx, _output, win); + add_1D_tensor_argument(idx, _lookup_indices.get(), 
win_lookup); + + enqueue(queue, *this, win); + } while (window.slide_window_slice_4D(win) && window.slide_window_slice_1D(win_lookup)); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp new file mode 100644 index 000000000..5db414f62 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/CLValidate.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Window.h" + +#include "support/ToolchainSupport.h" + +namespace arm_compute +{ +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma, const ITensorInfo *beta, float epsilon) +{ + ARM_COMPUTE_UNUSED(gamma); + ARM_COMPUTE_UNUSED(beta); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(epsilon == 0.f, "Epsilon must be different than 0"); + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); + + if (output != nullptr && output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), + "Input and output have different number of channels"); + } + + return Status{}; +} + +std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + // We handle the planes manually + Window win = calculate_max_window(*input, Steps(1)); + + // Output auto initialization if not yet initialized + auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type()); + + // CLInstanceNormalizationLayerKernelEx doesn't need padding so update_window_and_padding() can be + // skipped + Coordinates coord; + coord.set_num_dimensions(output->num_dimensions()); + output->set_valid_region(ValidRegion(coord, output->tensor_shape())); + return std::make_pair(Status{}, win); +} +} // namespace + 
+CLInstanceNormalizationLayerKernelEx::CLInstanceNormalizationLayerKernelEx() + : _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon(1e-12), + _run_in_place(false) +{ +} + +void CLInstanceNormalizationLayerKernelEx::configure(ICLTensor *input, ICLTensor *output, + ICLTensor *gamma, ICLTensor *beta, + float epsilon) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + _input = input; + _output = output == nullptr ? input : output; + _gamma = gamma; + _beta = beta; + _epsilon = epsilon; + + _run_in_place = (output == nullptr) || (output == input); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(_input->info(), _output->info(), + gamma ? gamma->info() : nullptr, + beta ? beta->info() : nullptr, epsilon)); + const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size(); + + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.add_option("-DVEC_SIZE=" + + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DDIM_X=" + support::cpp11::to_string(input->info()->dimension(0))); + build_opts.add_option("-DDIM_Y=" + support::cpp11::to_string(input->info()->dimension(1))); + build_opts.add_option("-DDIM_Z=" + support::cpp11::to_string(input->info()->dimension(2))); + build_opts.add_option("-DEPSILON=" + float_to_string_with_full_precision(epsilon)); + build_opts.add_option_if(gamma, "-DGAMMA"); + build_opts.add_option_if(beta, "-DBETA"); + build_opts.add_option_if(_run_in_place, "-DIN_PLACE"); + build_opts.add_option_if(_input->info()->data_layout() == DataLayout::NHWC, "-DNHWC"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("instance_normalization_ex", build_opts.options())); + + // Configure kernel window + auto win_config = validate_and_configure_window(_input->info(), _output->info()); + ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); + 
ICLKernel::configure_internal(std::get<1>(win_config)); +} + +Status CLInstanceNormalizationLayerKernelEx::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *gamma, + const ITensorInfo *beta, float epsilon) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, gamma, beta, epsilon)); + ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window( + input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get())))); + return Status{}; +} + +void CLInstanceNormalizationLayerKernelEx::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window collapsed_window = window.collapse(window, Window::DimZ); + + // We will process the planes together + if (_input->info()->data_layout() == DataLayout::NCHW) + { + collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1)); + collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1)); + } + else + { + collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1)); + collapsed_window.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(3), 1)); + } + + Window vec_window; + vec_window.set(Window::DimX, Window::Dimension(0, 0, 0)); + + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, collapsed_window); + if (!_run_in_place) + { + add_4D_tensor_argument(idx, _output, collapsed_window); + } + if (_gamma) + { + add_1D_tensor_argument(idx, _gamma, vec_window); + } + if (_beta) + { + add_1D_tensor_argument(idx, _beta, vec_window); + } + + enqueue(queue, *this, collapsed_window, lws_hint()); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp new file mode 100644 index 000000000..ecfe05a51 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp @@ -0,0 +1,88 @@ +/* + * Copyright 
(c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLNegKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16, DataType::S32, + DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16, DataType::S32, + DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape()); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + return Status{}; +} + +} // namespace + +CLNegKernel::CLNegKernel() : _input(nullptr), _output(nullptr) {} + +void CLNegKernel::configure(const ICLTensor *input, ICLTensor *output) +{ + + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info())); + + _input = input; + _output = output; + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Create kernel + std::set<std::string> build_opts; + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.emplace( + 
("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("neg_tensor", build_opts)); + + // Configure window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, input->info()->valid_region()); + + ICLKernel::configure_internal(win); +} + +void CLNegKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice); + add_3D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice, lws_hint()); + } while (collapsed.slide_window_slice_3D(slice)); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp new file mode 100644 index 000000000..e7d587029 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLPReLUKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +Status validate_info(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output) +{ + const TensorShape &out_shape = + TensorShape::broadcast_shape(input->tensor_shape(), alpha->tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, + DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(alpha, 1, DataType::F16, DataType::F32, + DataType::QASYMM8); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, + "Inputs are not broadcast compatible"); + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32, + DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), + "Wrong shape for output"); + } + return Status{}; +} +} // namespace + +CLPReLUKernel::CLPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {} + +void CLPReLUKernel::configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, alpha); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + 
ARM_COMPUTE_ERROR_THROW_ON(validate_info(input->info(), alpha->info(), output->info())); + + _input = input; + _alpha = alpha; + _output = output; + + // Create kernel + std::string kernel_name = "prelu"; + std::set<std::string> build_opts; + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.emplace( + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + + if (is_data_type_quantized_asymmetric(input->info()->data_type())) + { + build_opts.emplace("-DOFF_IN=" + + support::cpp11::to_string(input->info()->quantization_info().offset)); + build_opts.emplace("-DOFF_ALPHA=" + + support::cpp11::to_string(alpha->info()->quantization_info().offset)); + build_opts.emplace("-DOFF_OUT=" + + support::cpp11::to_string(output->info()->quantization_info().offset)); + build_opts.emplace("-DSCALE_IN=" + + support::cpp11::to_string(input->info()->quantization_info().scale)); + build_opts.emplace("-DSCALE_ALPHA=" + + support::cpp11::to_string(alpha->info()->quantization_info().scale)); + build_opts.emplace("-DSCALE_OUT=" + + support::cpp11::to_string(output->info()->quantization_info().scale)); + kernel_name += "_qasymm8"; + } + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + + const std::pair<TensorShape, ValidRegion> broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info()); + + const TensorShape &out_shape = broadcast_pair.first; + const ValidRegion &valid_region = broadcast_pair.second; + + // Auto initialize output if not initialized + { + set_shape_if_empty(*output->info(), out_shape); + + if (input->info()->data_type() == DataType::F16 && alpha->info()->data_type() == DataType::F16) + { + set_format_if_unknown(*output->info(), Format::F16); + } + else if (input->info()->data_type() == DataType::F32 || + alpha->info()->data_type() == DataType::F32) + { + set_format_if_unknown(*output->info(), 
Format::F32); + } + } + + Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); + Window win_input1 = win.broadcast_if_dimension_le_one(*input->info()); + Window win_input2 = win.broadcast_if_dimension_le_one(*alpha->info()); + + AccessWindowHorizontal input1_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input2_access(alpha->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win_input1, input1_access) || + update_window_and_padding(win_input2, input2_access) || + update_window_and_padding(win, output_access); + + output_access.set_valid_region(win, valid_region); + + ICLKernel::configure_internal(win); +} + +void CLPReLUKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const TensorShape &in_shape1 = _input->info()->tensor_shape(); + const TensorShape &in_shape2 = _alpha->info()->tensor_shape(); + const TensorShape &out_shape = _output->info()->tensor_shape(); + + bool can_collapse = true; + if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) + { + can_collapse = + (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); + for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) + { + can_collapse = (in_shape1[d] == in_shape2[d]); + } + } + + bool has_collapsed = false; + Window collapsed = + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) + : window; + + const TensorShape &in_shape1_collapsed = + has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; + const TensorShape &in_shape2_collapsed = + has_collapsed ? 
in_shape2.collapsed_from(Window::DimZ) : in_shape2; + + Window slice = collapsed.first_slice_window_3D(); + Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); + Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice_input1); + add_3D_tensor_argument(idx, _alpha, slice_input2); + add_3D_tensor_argument(idx, _output, slice); + + enqueue(queue, *this, slice); + + collapsed.slide_window_slice_3D(slice_input1); + collapsed.slide_window_slice_3D(slice_input2); + } while (collapsed.slide_window_slice_3D(slice)); +} + +BorderSize CLPReLUKernel::border_size() const +{ + const unsigned int replicateSize = + _output->info()->dimension(0) - + std::min(_input->info()->dimension(0), _alpha->info()->dimension(0)); + const unsigned int border = + std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); + return BorderSize(0, border, 0, 0); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp new file mode 100644 index 000000000..24e89db28 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; +namespace +{ +// NOTE This is necessary because it is not guaranteed that the axis positions of input and output +// are the same. +const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis) +{ + TensorShape out_shape{input_shape}; + + out_shape.set(axis, 1); + + return out_shape; +} +} // namespace + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis, + ReduceOperation op) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + } + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, + DataType::F32, DataType::S32); + if (op == ReduceOperation::SUM) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8, + "Not support QASYMM8, yet"); + } + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0, + "Inputs are not broadcast compatible"); + + const auto num_dimensions = input->tensor_shape().num_dimensions(); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= num_dimensions, "axis must be less than (input's rank)."); + + const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(), + "output shape's size does not match axis"); + + return Status{}; +} +} // namespace + +CLReduceOperationKernel::CLReduceOperationKernel() : _input(nullptr), _output(nullptr), _axis() {} + +void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *output, + const uint32_t 
axis, ReduceOperation op) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op)); + + _input = input; + _output = output; + _axis = axis; + + std::unique_ptr<ITensorInfo> output_info = output->info()->clone(); + output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis)); + + // Construct kernel name + std::string kernel_name; + int op_code = 0; + if (op == ReduceOperation::MAX) + { + kernel_name = "reduce_min_max"; + op_code = 1; + } + else if (op == ReduceOperation::MIN) + { + kernel_name = "reduce_min_max"; + op_code = 2; + } + else if (op == ReduceOperation::SUM) + { + kernel_name = "reduce_sum_mean"; + op_code = 3; + } + else if (op == ReduceOperation::MEAN) + { + kernel_name = "reduce_sum_mean"; + op_code = 4; + } + else + throw std::runtime_error("Operation not supported, yet"); + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(output_info->data_type())); + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2))); + build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code)); + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + + // Configure kernel window + Window win = calculate_max_window(*output_info, Steps()); + + Coordinates coord; + coord.set_num_dimensions(output_info->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape())); + + ICLKernel::configure_internal(win); +} + +Status CLReduceOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + const uint32_t axis, ReduceOperation op) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op)); + + return Status{}; +} + +void CLReduceOperationKernel::run(const 
Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const TensorShape &shape_in = _input->info()->tensor_shape(); + + unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters + + _kernel.setArg<cl_int>(idx++, _axis); + _kernel.setArg<cl_int>(idx++, shape_in[_axis]); + + // Support dimensions up to 4 + Window slice_out = window.collapse(ICLKernel::window(), 2, 4); + + // Setup input slice + Window slice_in(slice_out); + slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_in.set(3, Window::Dimension(0, 0, 0)); + + // Copy output's shape in order to use for recovering at end of this method + // TODO Remove changing and recovering output's shape if it is guaranteed that the axis positions + // of input and output are the same + const TensorShape shape_out = _output->info()->tensor_shape(); + _output->info()->set_tensor_shape(inferOutputShape(shape_in, _axis)); + + idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_out, lws_hint()); + + // Recover output's shape of output tensor + _output->info()->set_tensor_shape(shape_out); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp new file mode 100644 index 000000000..f7836b6cd --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_size, + const ITensorInfo *padding_size, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::F16, DataType::S32, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_size, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(padding_size, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::F16, DataType::S32, + DataType::F32); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != output->num_dimensions(), + "The number of dimensions of input should be equal to output"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() != output->data_layout(), + "The input and output layouts are different!"); + + // TODO Support other cases + if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NCHW) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != output->dimension(2), + "Input Depth should be equal to Output Depth"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 
|| + padding_size->dimension(1) != 2, + "Only 2-dimensional spatial block's size was wrong"); + } + else if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NHWC) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(0) != output->dimension(0), + "Input Depth should be equal to Output Depth"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 || + padding_size->dimension(1) != 2, + "Only 2-dimensional spatial block's size was wrong"); + } + else + { + ARM_COMPUTE_RETURN_ERROR_MSG("CLSpaceToBatchNDKernel supports only 4-dimensional input"); + } + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() < 2 && input->num_dimensions() > 4, + "CLSpaceToBatchNDKernel supports dimensions up to 4"); + + if (input->data_type() == DataType::QASYMM8) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->quantization_info() != output->quantization_info(), + "The input and output quantization info are different!"); + } + + return Status{}; +} + +} // namespace + +CLSpaceToBatchNDKernel::CLSpaceToBatchNDKernel() +{ + // DO NOTHING +} + +void CLSpaceToBatchNDKernel::configure(const ICLTensor *input, const ICLTensor *block_size, + const ICLTensor *padding_size, ICLTensor *output) +{ + + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), block_size->info(), padding_size->info(), output->info())); + + _input = input; + _block_size = block_size; + _padding_size = padding_size; + _output = output; + + // Set kernel build options + // TODO Support other cases + std::string kernel_name = "space_to_batch_4d"; + std::set<std::string> build_opts; + Window win; + + if (input->info()->data_layout() == DataLayout::NCHW) + { + kernel_name += "_nchw"; + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); + build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(1))); + build_opts.emplace("-DWIDTH_IN=" + 
support::cpp11::to_string(input->info()->dimension(0))); + + win = calculate_max_window(*output->info(), Steps()); + + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + } + else if (input->info()->data_layout() == DataLayout::NHWC) + { + kernel_name += "_nhwc"; + build_opts.emplace("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); + build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(2))); + build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(1))); + build_opts.emplace("-DVEC_SIZE=" + + support::cpp11::to_string(num_elems_processed_per_iteration)); + + win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, input_access, output_access); + input_access.set_valid_region(win, output->info()->valid_region()); + + if (window_changed) + { + ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!"); + } + } + else + { + ARM_COMPUTE_ERROR("Unsupported layout"); + } + + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(3))); + if (input->info()->data_type() == DataType::QASYMM8) + { + build_opts.emplace("-DZERO_VALUE=" + + support::cpp11::to_string(input->info()->quantization_info().offset)); + } + else + { + build_opts.emplace("-DZERO_VALUE=" + support::cpp11::to_string(0)); + } + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + + // Configure kernel window + 
ICLKernel::configure_internal(win); +} + +void CLSpaceToBatchNDKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + +#if defined(ARM_COMPUTE_DEBUG_ENABLED) + const_cast<ICLTensor *>(_block_size)->map(queue); + const_cast<ICLTensor *>(_padding_size)->map(queue); + + const size_t num_dimensions = _input->info()->num_dimensions(); + const size_t num_spacial_dimensions = _block_size->info()->dimension(0); + uint32_t batch_size = _input->info()->dimension(num_dimensions - 1); + for (size_t i = 0; i < num_spacial_dimensions; ++i) + { + const int32_t block_size = *reinterpret_cast<int32_t *>(_block_size->ptr_to_element({i})); + const int32_t padding_size_pre = + *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({0, i})); + const int32_t padding_size_post = + *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({1, i})); + + ARM_COMPUTE_ERROR_ON_MSG(block_size < 1, "Block size should be greater than or equal to 1"); + ARM_COMPUTE_ERROR_ON_MSG(padding_size_pre < 0 && padding_size_post < 0, + "Padding size should be greater than or equal to 0"); + + if (num_dimensions == 4 && _input->info()->data_layout() == DataLayout::NCHW) + { + ARM_COMPUTE_ERROR_ON_MSG( + _output->info()->dimension(i) != + (_input->info()->dimension(i) + padding_size_pre + padding_size_post) / block_size, + "Dimension value of spatial block does not match output's dimension value"); + } + else + { + ARM_COMPUTE_ERROR_ON_MSG( + _output->info()->dimension(num_dimensions - num_spacial_dimensions - 1 + i) != + (_input->info()->dimension(num_dimensions - num_spacial_dimensions - 1 + i) + + padding_size_pre + padding_size_post) / + block_size, + "Dimension value of spatial block does not match output's dimension value"); + } + + batch_size *= block_size; + } + ARM_COMPUTE_ERROR_ON_MSG( + _output->info()->dimension(num_dimensions - 1) != batch_size, + "Output batch 
size should be equal to input batch size * (multiplication of all block size)"); + + const_cast<ICLTensor *>(_block_size)->unmap(queue); + const_cast<ICLTensor *>(_padding_size)->unmap(queue); +#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) + + Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + // Setup output slice + Window slice_in(slice_out); + slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_in.set(3, Window::Dimension(0, 0, 0)); + + // Set block size window + Window win_block = calculate_max_window(*_block_size->info(), Steps()); + + // Set padding size window + Window win_padding = calculate_max_window(*_padding_size->info(), Steps()); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + add_1D_tensor_argument(idx, _block_size, win_block); + add_2D_tensor_argument(idx, _padding_size, win_padding); + enqueue(queue, *this, slice_out); + } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in)); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp new file mode 100644 index 000000000..b085192a2 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const int32_t block_size) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1, + "Block size should be greater than or equal to 1."); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(3) != output->dimension(3), + "Input batch should be equal to Output batch"); + + auto layout_out = input->data_layout(); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); + + auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL); + auto index_height = get_data_layout_dimension_index(layout_out, DataLayoutDimension::HEIGHT); + auto index_width = get_data_layout_dimension_index(layout_out, DataLayoutDimension::WIDTH); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + input->dimension(index_depth) * block_size * block_size != output->dimension(index_depth), + "Output depth should be equal to (input depth * block size *block size)"); + + 
ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->dimension(index_width) % block_size) || + (input->dimension(index_height) % block_size), + "Input height and width should be divisible by block size"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (output->dimension(index_width) != (input->dimension(index_width) / block_size)) || + (output->dimension(index_height) != (input->dimension(index_height) / block_size)), + "Output height and width should be equal to " + "input_height/blocksize and input_width/blocksize respectively"); + + return Status{}; +} + +} // namespace + +CLSpaceToDepthKernel::CLSpaceToDepthKernel() : _input(nullptr), _output(nullptr) {} + +void CLSpaceToDepthKernel::configure(const ICLTensor *input, ICLTensor *output, + const int32_t block_size) +{ + + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size)); + + _input = input; + _output = output; + + // Set kernel build options + auto layout_out = input->info()->data_layout(); + std::set<std::string> build_opts; + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size)); + auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL); + auto depth = input->info()->dimension(index_depth); + build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(depth)); + build_opts.emplace("-DZ_IN=" + support::cpp11::to_string(input->info()->tensor_shape().z())); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel( + "space_to_depth_" + lower_string(string_from_data_layout(layout_out)), build_opts)); + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps()); + + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, 
output->info()->tensor_shape())); + + ICLKernel::configure_internal(win); +} + +void CLSpaceToDepthKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + + Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + // Setup output slice + Window slice_out(slice_in); + slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_out.set(3, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_in); + } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp new file mode 100644 index 000000000..4f2b388c9 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp @@ -0,0 +1,473 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +// Disable GPU implementation +// TODO Enable GPU implementation with verification, or remove code +// Invalid result on GPU +#if 0 +namespace arm_compute +{ +//////////////////////////////////////////////////////////////////////////////// +CLTopKV2Single::CLTopKV2Single() : _input(nullptr), _topk_values(nullptr), _topk_indices(nullptr) {} + +void CLTopKV2Single::configure(ICLTensor *input, ICLTensor *topk_values, ICLTensor *topk_indices, + cl::Buffer *indices, cl::Buffer *temp_stack, int k, int n) +{ + ARM_COMPUTE_ERROR_ON(input == nullptr && indices == nullptr); + ARM_COMPUTE_ERROR_ON(topk_values == nullptr && topk_indices == nullptr); + ARM_COMPUTE_ERROR_ON(n == 0); + + _input = input; + _topk_values = topk_values; + _topk_indices = topk_indices; + + // Set kernel build options + std::set<std::string> build_opts; + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("topkv2_quicksort", build_opts)); + + unsigned int idx = 3 * num_arguments_per_1D_tensor(); + _kernel.setArg(idx++, *indices); + _kernel.setArg(idx++, *temp_stack); + _kernel.setArg<cl_int>(idx++, k); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, 1, 1)); + ICLKernel::configure_internal(win); +} + +void CLTopKV2Single::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int idx = 0; + add_1D_tensor_argument(idx, _input, window); + add_1D_tensor_argument(idx, _topk_values, window); + add_1D_tensor_argument(idx, _topk_indices, window); + + enqueue(queue, *this, window); +} + +//////////////////////////////////////////////////////////////////////////////// 
+CLTopKV2Init::CLTopKV2Init() : _input(nullptr) {} + +void CLTopKV2Init::configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffer *in_ind_buf, + int n) +{ + ARM_COMPUTE_ERROR_ON(input == nullptr && in_key_buf == nullptr); + ARM_COMPUTE_ERROR_ON(in_ind_buf == nullptr); + ARM_COMPUTE_ERROR_ON(n == 0); + + _input = input; + + // Set kernel build options + std::set<std::string> build_opts; + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("topkv2_init", build_opts)); + + unsigned int idx = num_arguments_per_1D_tensor(); + _kernel.setArg(idx++, *in_key_buf); + _kernel.setArg(idx++, *in_ind_buf); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, n, 1)); + ICLKernel::configure_internal(win); +} + +void CLTopKV2Init::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int idx = 0; + add_1D_tensor_argument(idx, _input, window); + + enqueue(queue, *this, window); +} + +//////////////////////////////////////////////////////////////////////////////// +// This kernel makes a histogram of radix for each work item. 
+CLRadixSortHistogram::CLRadixSortHistogram() : _pass(0), _in_key_buf(nullptr) {} + +void CLRadixSortHistogram::configure(cl::Buffer *hist_buf, int bits, int n) +{ + ARM_COMPUTE_ERROR_ON(hist_buf == nullptr); + + unsigned int radix = 1 << bits; + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); + build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); + build_opts.emplace("-DPERMUT=1"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("radixsort_histogram", build_opts)); + + int loc_histo_size = radix * _ITEMS * sizeof(cl_int); + + unsigned int idx = 1; + _kernel.setArg(idx++, *hist_buf); + + idx = 3; + _kernel.setArg(idx++, loc_histo_size, nullptr); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1)); + ICLKernel::configure_internal(win); +} + +void CLRadixSortHistogram::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + _kernel.setArg(0, *_in_key_buf); + _kernel.setArg<cl_int>(2, _pass); + + cl::NDRange lws = cl::NDRange(_ITEMS, 1); + + enqueue(queue, *this, window, lws); +} + +//////////////////////////////////////////////////////////////////////////////// +CLRadixSortScanHistogram::CLRadixSortScanHistogram() {} + +void CLRadixSortScanHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits) +{ + ARM_COMPUTE_ERROR_ON(hist_buf == nullptr && glob_sum_buf == nullptr); + + unsigned int radix = 1 << bits; + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); + build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); + build_opts.emplace("-DPERMUT=1"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( 
+ CLKernelLibraryEx::get().create_kernel("radixsort_scanhistograms", build_opts)); + + int temp_size = + std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint); + + unsigned int idx = 0; + _kernel.setArg(idx++, *hist_buf); + _kernel.setArg(idx++, temp_size, nullptr); + _kernel.setArg(idx++, *glob_sum_buf); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1)); + ICLKernel::configure_internal(win); +} + +void CLRadixSortScanHistogram::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); + cl::NDRange lws = cl::NDRange(gws_x / _HISTOSPLIT, 1); + + enqueue(queue, *this, window, lws); +} + +//////////////////////////////////////////////////////////////////////////////// +CLRadixSortGlobalScanHistogram::CLRadixSortGlobalScanHistogram() {} + +void CLRadixSortGlobalScanHistogram::configure(cl::Buffer *glob_sum_buf, cl::Buffer *temp_buf, + int bits) +{ + ARM_COMPUTE_ERROR_ON(glob_sum_buf == nullptr && temp_buf == nullptr); + + unsigned int radix = 1 << bits; + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); + build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); + build_opts.emplace("-DPERMUT=1"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("radixsort_scanhistograms", build_opts)); + + int temp_size = + std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint); + + unsigned int idx = 0; + _kernel.setArg(idx++, *glob_sum_buf); + _kernel.setArg(idx++, temp_size, nullptr); + _kernel.setArg(idx++, *temp_buf); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, 
_HISTOSPLIT / 2, 1)); + ICLKernel::configure_internal(win); +} + +void CLRadixSortGlobalScanHistogram::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); + cl::NDRange lws = cl::NDRange(gws_x, 1); + + enqueue(queue, *this, window, lws); +} + +//////////////////////////////////////////////////////////////////////////////// +CLRadixSortPasteHistogram::CLRadixSortPasteHistogram() {} + +void CLRadixSortPasteHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits) +{ + ARM_COMPUTE_ERROR_ON(hist_buf == nullptr && glob_sum_buf == nullptr); + + unsigned int radix = 1 << bits; + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); + build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); + build_opts.emplace("-DPERMUT=1"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("radixsort_pastehistograms", build_opts)); + + unsigned int idx = 0; + _kernel.setArg(idx++, *hist_buf); + _kernel.setArg(idx++, *glob_sum_buf); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1)); + ICLKernel::configure_internal(win); +} + +void CLRadixSortPasteHistogram::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); + cl::NDRange lws = cl::NDRange(gws_x / _HISTOSPLIT, 1); + + enqueue(queue, *this, window, lws); +} + +//////////////////////////////////////////////////////////////////////////////// +CLRadixSortReorder::CLRadixSortReorder() + : _pass(0), 
_in_key_buf(nullptr), _out_key_buf(nullptr), _in_ind_buf(nullptr), + _out_ind_buf(nullptr) +{ +} + +void CLRadixSortReorder::configure(cl::Buffer *hist_buf, int bits, int n) +{ + ARM_COMPUTE_ERROR_ON(hist_buf == nullptr); + ARM_COMPUTE_ERROR_ON(n == 0); + + unsigned int radix = 1 << bits; + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); + build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); + build_opts.emplace("-DPERMUT=1"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("radixsort_reorder", build_opts)); + + unsigned int idx = 2; + _kernel.setArg(idx++, *hist_buf); + + idx = 6; + _kernel.setArg(idx++, sizeof(uint) * radix * _ITEMS, nullptr); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1)); + ICLKernel::configure_internal(win); +} + +void CLRadixSortReorder::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); + unsigned int lx = std::max(1U, (gws_x / _HISTOSPLIT)); + cl::NDRange lws = (lx < gws_x) ? 
cl::NDRange(lx, 1) : cl::NDRange(1, 1); + + _kernel.setArg(0, *_in_key_buf); + _kernel.setArg(1, *_out_key_buf); + _kernel.setArg<cl_int>(3, _pass); + _kernel.setArg(4, *_in_ind_buf); + _kernel.setArg(5, *_out_ind_buf); + + enqueue(queue, *this, window, lws); +} + +//////////////////////////////////////////////////////////////////////////////// +CLTopKV2FindFirstNegative::CLTopKV2FindFirstNegative() : _out_key_buf(nullptr) {} + +void CLTopKV2FindFirstNegative::configure(cl::Buffer *first_negative_idx_buf, int n) +{ + ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr); + ARM_COMPUTE_ERROR_ON(n == 0); + + // Set kernel build options + std::set<std::string> build_opts; + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("topkv2_find_first_negative", build_opts)); + + unsigned int idx = 1; + _kernel.setArg(idx++, *first_negative_idx_buf); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, n, 1)); + ICLKernel::configure_internal(win); +} + +void CLTopKV2FindFirstNegative::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int idx = 0; + _kernel.setArg(idx++, *_out_key_buf); + + enqueue(queue, *this, window); +} + +//////////////////////////////////////////////////////////////////////////////// +CLTopKV2ReorderNegatives::CLTopKV2ReorderNegatives() + : _in_key_buf(nullptr), _out_key_buf(nullptr), _in_ind_buf(nullptr), _out_ind_buf(nullptr) +{ +} + +void CLTopKV2ReorderNegatives::configure(cl::Buffer *first_negative_idx_buf, int n) +{ + ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr); + ARM_COMPUTE_ERROR_ON(n == 0); + + // Set kernel build options + std::set<std::string> build_opts; + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("topkv2_reorder_negatives", 
build_opts)); + + unsigned int idx = 4; + _kernel.setArg(idx++, *first_negative_idx_buf); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, n, 1)); + ICLKernel::configure_internal(win); +} + +void CLTopKV2ReorderNegatives::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int idx = 0; + _kernel.setArg(idx++, *_in_key_buf); + _kernel.setArg(idx++, *_out_key_buf); + _kernel.setArg(idx++, *_in_ind_buf); + _kernel.setArg(idx++, *_out_ind_buf); + + enqueue(queue, *this, window); +} + +//////////////////////////////////////////////////////////////////////////////// +CLTopKV2Store::CLTopKV2Store() + : _values(nullptr), _indices(nullptr), _out_key_buf(nullptr), _out_ind_buf(nullptr) +{ +} + +void CLTopKV2Store::configure(ICLTensor *values, ICLTensor *indices, int k, int n) +{ + ARM_COMPUTE_ERROR_ON(values == nullptr && indices == nullptr); + ARM_COMPUTE_ERROR_ON(k == 0); + ARM_COMPUTE_ERROR_ON(k > n); + + _values = values; + _indices = indices; + + // Set kernel build options + std::set<std::string> build_opts; + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("topkv2_store", build_opts)); + + unsigned int idx = 2 * num_arguments_per_1D_tensor() + 2; + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, k, 1)); + ICLKernel::configure_internal(win); +} + +void CLTopKV2Store::setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf) +{ + _out_key_buf = out_key_buf; + _out_ind_buf = out_ind_buf; +} + +void CLTopKV2Store::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int idx = 0; + add_1D_tensor_argument(idx, _values, window); 
+ add_1D_tensor_argument(idx, _indices, window); + _kernel.setArg(idx++, *_out_key_buf); + _kernel.setArg(idx++, *_out_ind_buf); + + enqueue(queue, *this, window); +} + +} // namespace arm_compute +#endif // Disable GPU implementation diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp new file mode 100644 index 000000000..6cc8d9d13 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/CLValidate.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +CLTransposeConvLayerUpsampleKernel::CLTransposeConvLayerUpsampleKernel() + : _input(nullptr), _output(nullptr), _inner_border(), _info() +{ +} + +Status CLTransposeConvLayerUpsampleKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const BorderSize &inner_border, + const PadStrideInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); + + const DataLayout data_layout = input->data_layout(); + + const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + + ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_w) == 0); + ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_h) == 0); + + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_c) != output->dimension(idx_c)); + for (size_t i = 3; i < Coordinates::num_max_dimensions; ++i) + { + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i)); + } + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border.right > info.stride().first - 1, + "inner_border_right must be smaller that stride_x"); + 
ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border.top > info.stride().second - 1, + "inner_border_top must be smaller that stride_y"); + + return Status{}; +} + +void CLTransposeConvLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output, + const BorderSize &inner_border, + const PadStrideInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + _input = input; + _output = output; + _inner_border = inner_border; + _info = info; + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(CLTransposeConvLayerUpsampleKernel::validate( + input->info(), output->info(), inner_border, info)); + + // Create kernel + CLBuildOptions build_opts; + build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + _kernel = static_cast<cl::Kernel>( + CLKernelLibrary::get().create_kernel("deconvolution_upsample", build_opts.options())); + + constexpr unsigned int num_elems_processed_per_iteration = 1; + + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); + + ICLKernel::configure_internal(win); +} + +void CLTransposeConvLayerUpsampleKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const DataLayout data_layout = _input->info()->data_layout(); + + const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + const int out_start_x = _info.pad_left(); + const int out_end_x = _output->info()->dimension(idx_w) - _inner_border.right - + _info.pad_right() + _info.stride().first - 1; + const int out_step_x 
= _info.stride().first; + + const int out_start_y = _inner_border.top + _info.pad_top(); + const int out_end_y = + _output->info()->dimension(idx_h) - _info.pad_bottom() + _info.stride().second - 1; + const int out_step_y = _info.stride().second; + + switch (data_layout) + { + case DataLayout::NCHW: + { + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + + Window slice_out = collapsed.first_slice_window_3D(); + slice_out.set(Window::DimX, Window::Dimension(out_start_x, out_end_x, out_step_x)); + slice_out.set(Window::DimY, Window::Dimension(out_start_y, out_end_y, out_step_y)); + + Window slice_in = collapsed.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice_in); + add_3D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_out); + } while (collapsed.slide_window_slice_3D(slice_in) && + collapsed.slide_window_slice_3D(slice_out)); + break; + } + case DataLayout::NHWC: + { + // NOTE: not collapsing in NHWC + Window slice_out = window.first_slice_window_3D(); + slice_out.set(Window::DimY, Window::Dimension(out_start_x, out_end_x, out_step_x)); + slice_out.set(Window::DimZ, Window::Dimension(out_start_y, out_end_y, out_step_y)); + + Window slice_in = window.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice_in); + add_3D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_out); + } while (window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out)); + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported data layout"); + } +} diff --git a/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp b/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp new file mode 100644 index 000000000..8ac667ceb --- /dev/null +++ b/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2019 Samsung Electronics 
Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include <cstddef> +#include <cstdint> + +namespace arm_compute +{ +CPPUpsampleKernelEx::CPPUpsampleKernelEx() : _input(nullptr), _output(nullptr), _info() {} + +bool CPPUpsampleKernelEx::is_parallelisable() const { return false; } + +void CPPUpsampleKernelEx::configure(const ITensor *input, ITensor *output, + const PadStrideInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + _input = input; + _output = output; + _info = info; + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps()); + + // The CPPUpsampleKernelEx doesn't need padding so update_window_and_padding() can be skipped + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + + ICPPKernel::configure(win); +} + +void CPPUpsampleKernelEx::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); + + // Initialize _scaled_output buffer + const int width_scaled = _output->info()->dimension(0); + const int height_scaled = _output->info()->dimension(1); + const int stride_x = _info.stride().first; + const int stride_y = _info.stride().second; + const int start_x = _info.pad_left(); + const int start_y = _info.pad_top(); + const int end_y = height_scaled - _info.pad_bottom(); + const int end_x = width_scaled - _info.pad_top(); + const size_t element_size = _input->info()->element_size(); + + // The fill value is normally 0, but for QASYMM8 the '0' 
corresponds to the offset + const uint8_t fill_value = + _output->info()->data_type() == DataType::QASYMM8 + ? utility::clamp<uint8_t>(_output->info()->quantization_info().offset) + : 0; + // Filling a value different than 0 works only for QASYMM8 datatype since we are filling 1byte + // values in a buffer of uint8_ts + std::fill_n(_output->buffer(), _output->info()->total_size(), fill_value); + + // Create window + Window window_out(window); + window_out.set(Window::DimX, Window::Dimension(start_x, end_x, stride_x)); + window_out.set(Window::DimY, Window::Dimension(start_y, end_y, stride_y)); + + // Create iterators + Iterator in(_input, window); + Iterator out(_output, window_out); + + execute_window_loop( + window, [&](const Coordinates &) { memcpy(out.ptr(), in.ptr(), element_size); }, in, out); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp new file mode 100644 index 000000000..4508f5800 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp @@ -0,0 +1,346 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h" + +#include <algorithm> +#include "arm_compute/core/Types.h" +#include "arm_compute/core/NEON/NEAsymm.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Window.h" + +namespace +{ +void store_quantized_int32(uint8_t *output_ptr, const int32x4x4_t &out) +{ + const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1]))); + const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3]))); + vst1q_u8(output_ptr, vcombine_u8(pa, pb)); +} + +using namespace arm_compute; +template <typename InputScalarType, typename OutputScalarType, typename InputVectorType> +void elementwise_op_templ( + const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &), + int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, + OutputScalarType *, const bool), + int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, + OutputScalarType *)) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, 
Window::Dimension(0, 1, 1)); + + const int window_step_x = std::min(16 / static_cast<int>(sizeof(OutputScalarType)), 8); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0); + + if (is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop(win, + [&](const Coordinates &) { + auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); + const auto non_broadcast_input_ptr = + reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr()); + const InputScalarType broadcast_value = + *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr()); + + int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, + non_broadcast_input_ptr, broadcast_value, + output_ptr, !is_broadcast_input_2); + for (; x < window_end_x; ++x) + { + const auto a = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = + (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, + !is_broadcast_input_2 ? 
a : broadcast_value); + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + execute_window_loop(win, + [&](const Coordinates &) { + auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); + const auto input1_ptr = + reinterpret_cast<const InputScalarType *>(input1.ptr()); + const auto input2_ptr = + reinterpret_cast<const InputScalarType *>(input2.ptr()); + + int x = (*neon_func)(window_start_x, window_end_x, window_step_x, + input1_ptr, input2_ptr, output_ptr); + for (; x < window_end_x; ++x) + { + const auto a = *(input1_ptr + x); + const auto b = *(input2_ptr + x); + *(output_ptr + x) = (*scalar_func)(a, b); + } + }, + input1, input2, output); + } +} + +} // namespace + +namespace arm_compute +{ + +float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, + const float32x4_t &scale) +{ + qasymm8x16_t x = vld1q_u8(input1_ptr); + const float32x4x4_t out = {{ + vmulq_f32( + vcvtq_f32_s32(vsubq_s32( + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(x))))), offset)), + scale), + vmulq_f32( + vcvtq_f32_s32(vsubq_s32( + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(x))))), offset)), + scale), + vmulq_f32( + vcvtq_f32_s32(vsubq_s32( + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(x))))), offset)), + scale), + vmulq_f32( + vcvtq_f32_s32(vsubq_s32( + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(x))))), offset)), + scale), + }}; + return out; +} + +void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, + const float32x4_t &invscale) +{ + int32x4x4_t out = {{ + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], 
invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)), + }}; + store_quantized_int32(output_ptr, out); +} + +float32x4x4_t dup_quantized(uint8_t broadcast_value, int offset, float scale) +{ + const qasymm8x16_t broadcast_value_vec = vdupq_n_u8(broadcast_value); + const int32x4_t voffset = vdupq_n_s32(offset); + const float32x4_t vscale = vdupq_n_f32(scale); + + const float32x4x4_t broadcast_vector = {{ + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16( + vmovl_u8(vget_low_u8(broadcast_value_vec))))), + voffset)), + vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16( + vmovl_u8(vget_low_u8(broadcast_value_vec))))), + voffset)), + vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16( + vmovl_u8(vget_high_u8(broadcast_value_vec))))), + voffset)), + vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16( + vmovl_u8(vget_high_u8(broadcast_value_vec))))), + voffset)), + vscale), + }}; + return broadcast_vector; +} + +void elementwise_op_quantized( + const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + uint8_t (*scalar_func)(const float &, const float &, QuantizationInfo), + int (*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, + float32x4_t, float32x4_t, float32x4_t, const bool), + int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *, int32x4_t, + int32x4_t, float32x4_t, float32x4_t, float32x4_t, float32x4_t)) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + 
win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0); + + const float output_scale = out->info()->quantization_info().scale; + const int output_offset = out->info()->quantization_info().offset; + + // Output quantization info (add 0.5 to round toward the nearest integer - 0.5 rounds away from + // zero) + const float32x4_t voffseto = vdupq_n_f32(output_offset + 0.5f); + const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_scale); + + if (is_broadcast_across_x) + { + // Select the broadcast input on the X axis + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
in2 : in1; + + const QuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info(); + const QuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info(); + + const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset); + const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) { + const auto non_broadcast_input_ptr = + reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr()); + const float32x4x4_t broadcast_vector = + dup_quantized(broadcast_value, broadcast_qinfo.offset, broadcast_qinfo.scale); + + int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, + non_broadcast_input_ptr, broadcast_vector, output_ptr, + voffset_non_broadcast, vscale_non_broadcast, voffseto, + invvscaleo, !is_broadcast_input_2); + for (; x < window_end_x; ++x) + { + const float afs = + scvt_f32_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo.scale, + non_broadcast_qinfo.offset); + const float bfs = + scvt_f32_qasymm8(broadcast_value, broadcast_qinfo.scale, broadcast_qinfo.offset); + *(output_ptr + x) = + (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? 
afs : bfs, + out->info()->quantization_info()); + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Input1 quantization info + const int32x4_t voffset1 = vdupq_n_s32(in1->info()->quantization_info().offset); + const float32x4_t vscale1 = vdupq_n_f32(in1->info()->quantization_info().scale); + + // Input2 quantization info + const int32x4_t voffset2 = vdupq_n_s32(in2->info()->quantization_info().offset); + const float32x4_t vscale2 = vdupq_n_f32(in2->info()->quantization_info().scale); + + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const QuantizationInfo input1_qinfo = in1->info()->quantization_info(); + const QuantizationInfo input2_qinfo = in2->info()->quantization_info(); + + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) { + const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + int x = + (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, + output_ptr, voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo); + for (; x < window_end_x; ++x) + { + const float afs = + scvt_f32_qasymm8(*(input1_ptr + x), input1_qinfo.scale, input1_qinfo.offset); + const float bfs = + scvt_f32_qasymm8(*(input2_ptr + x), input2_qinfo.scale, input2_qinfo.offset); + *(output_ptr + x) = (*scalar_func)(afs, bfs, out->info()->quantization_info()); + } + }, + input1, input2, output); + } +} + +void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + float (*scalar_func)(const float &, const float &), + int (*broadcast_func)(int, int, int, const float *, const float &, float *, + 
const bool), + int (*neon_func)(int, int, int, const float *, const float *, float *)) +{ + elementwise_op_templ<float, float, float32x4_t>(in1, in2, out, window, scalar_func, + broadcast_func, neon_func); +} + +void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + uint8_t (*scalar_func)(const uint8_t &, const uint8_t &), + int (*broadcast_func)(int, int, int, const uint8_t *, const uint8_t &, + uint8_t *, const bool), + int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *)) +{ + elementwise_op_templ<uint8_t, uint8_t, uint8x16_t>(in1, in2, out, window, scalar_func, + broadcast_func, neon_func); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp new file mode 100644 index 000000000..d2f42de53 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" + +#include <algorithm> +#include <arm_neon.h> +#include <map> +#include <string> + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +namespace arm_compute +{ + +template <BinaryLogicalOperation op, typename ScalarType> +inline ScalarType elementwise_logic_op_scalar(const ScalarType &a, const ScalarType &b) +{ + auto res = ScalarType(0); + + switch (op) + { + case BinaryLogicalOperation::AND: + res = a & b; + break; + case BinaryLogicalOperation::OR: + res = a | b; + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + return res; +} + +template <BinaryLogicalOperation op, typename VectorType> +inline VectorType elementwise_logic_op(const VectorType &a, const VectorType &b) +{ + VectorType res = {0, 0, 0, 0}; + + switch (op) + { + case BinaryLogicalOperation::AND: + res = wrapper::vand(a, b); + break; + case BinaryLogicalOperation::OR: + res = wrapper::vorr(a, b); + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + return res; +} + +template <BinaryLogicalOperation op> +inline uint8x16x4_t elementwise_logic_op(const uint8x16x4_t &a, const 
uint8x16x4_t &b) +{ + uint8x16x4_t out = {{ + elementwise_logic_op<op>(a.val[0], b.val[0]), elementwise_logic_op<op>(a.val[1], b.val[1]), + elementwise_logic_op<op>(a.val[2], b.val[2]), elementwise_logic_op<op>(a.val[3], b.val[3]), + }}; + return out; +} + +template <BinaryLogicalOperation op, typename ScalarType, typename VectorType> +inline VectorType elementwise_logic_op_broadcast(const VectorType &a, + const ScalarType &broadcast_value, + const bool reorder) +{ + VectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag()); + return elementwise_logic_op<op>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector); +} + +template <BinaryLogicalOperation op, typename ScalarType, typename VectorType> +inline int elementwise_logic_op_loop(int window_start_x, int window_end_x, int window_step_x, + const ScalarType *input1_ptr, const ScalarType *input2_ptr, + ScalarType *output_ptr) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(input1_ptr + x); + const auto b = wrapper::vloadq(input2_ptr + x); + wrapper::vstore(output_ptr + x, elementwise_logic_op<op>(a, b)); + } + return x; +} + +template <BinaryLogicalOperation op, typename ScalarType, typename VectorType> +inline int elementwise_logic_op_broadcast_loop(int window_start_x, int window_end_x, + int window_step_x, + const ScalarType *non_broadcast_input_ptr, + const ScalarType &broadcast_value, + ScalarType *output_ptr, const bool reorder) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq((non_broadcast_input_ptr + x)); + wrapper::vstore(output_ptr + x, + elementwise_logic_op_broadcast<op>(a, broadcast_value, reorder)); + } + return x; +} + +template <BinaryLogicalOperation op, typename ScalarType, typename VectorType> +void elementwise_logic_op(const ITensor *in1, const ITensor *in2, ITensor *out, + const 
Window &window) +{ + elementwise_op(in1, in2, out, window, &elementwise_logic_op_scalar<op, ScalarType>, + &elementwise_logic_op_broadcast_loop<op, ScalarType, VectorType>, + &elementwise_logic_op_loop<op, ScalarType, VectorType>); +} + +std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)> configure_func( + const ITensor *input1, const ITensor *input2, ITensor *output, + std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function) +{ + std::string function_to_call("op_"); + function_to_call += string_from_data_type(input1->info()->data_type()) + "_"; + function_to_call += string_from_data_type(input2->info()->data_type()) + "_"; + function_to_call += string_from_data_type(output->info()->data_type()); + + auto it = map_function.find(function_to_call); + + if (it != map_function.end()) + { + auto func = it->second; + return [func](const ITensor *input1, const ITensor *input2, ITensor *output, + const Window &window) { func(input1, input2, output, window); }; + } + return nullptr; +} + +template <BinaryLogicalOperation op> +std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)> +configure_logic_func(const ITensor *input1, const ITensor *input2, ITensor *output) +{ + static std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function = { + {"op_U8_U8_U8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}, + {"op_QASYMM8_QASYMM8_QASYMM8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}}; + + return configure_func(input1, input2, output, map_function); +} + +void NEBinaryLogicalOperationKernel::configure(BinaryLogicalOperation op, const ITensor *input1, + const ITensor *input2, ITensor *output) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info())); + configure_common(input1, input2, output); + switch (op) + { + case BinaryLogicalOperation::AND: + _function = configure_logic_func<BinaryLogicalOperation::AND>(input1, 
input2, output); + break; + case BinaryLogicalOperation::OR: + _function = configure_logic_func<BinaryLogicalOperation::OR>(input1, input2, output); + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } +} + +Status NEBinaryLogicalOperationKernel::validate_arguments(const ITensorInfo &input1, + const ITensorInfo &input2, + const ITensorInfo &output) +{ + // Validate in case of configured output + if (output.total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, + DataType::QASYMM8); + } + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2); + + const TensorShape out_shape = + TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, + "Inputs are not broadcast compatible"); + + // Validate in case of configured output + if (output.total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, output.tensor_shape(), 0), + "Wrong shape for output"); + } + + return Status{}; +} + +Status NEBinaryLogicalOperationKernel::validate(BinaryLogicalOperation op, + const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output) +{ + ARM_COMPUTE_UNUSED(op); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output)); + return Status{}; +} + +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp new file mode 100644 index 000000000..7e4fc129b --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp @@ -0,0 +1,653 @@ +/* + * Copyright (c) 2019 Samsung 
Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NECastKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/NEAsymm.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + SubDataType input_subtype) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, + DataType::QASYMM8, DataType::U32, + DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(input_subtype == SubDataType::BOOL && + input->data_type() != DataType::U8); + + if (output->tensor_shape().total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, + DataType::QASYMM8, DataType::U32, + DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + } + + return Status{}; +} + +std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + // Configure kernel window + Window win = calculate_max_window(*input, Steps()); + + // Output tensor auto initialization if not yet initialized + auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32); + + // NECastKernel doesn't need padding so update_window_and_padding() can be skipped + Coordinates coord; + coord.set_num_dimensions(output->num_dimensions()); + output->set_valid_region(ValidRegion(coord, output->tensor_shape())); + + return std::make_tuple(Status{}, win); +} + +typedef struct bool8x16 +{ + uint8x16_t val; +} bool8x16_t; + +static inline 
uint8x16_t vreinterpretq_u8_b8(bool8x16_t __a) { return (uint8x16_t)__a.val; } + +template <typename ToV, typename FromV> inline ToV vcast(const FromV &v) { return v; } +template <> inline uint8x16_t vcast(const bool8x16_t &v) +{ + const uint8x16_t vu8 = vreinterpretq_u8_b8(v); + const uint8x16_t zero_uint8x16 = vdupq_n_u8(0); + uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16); + return vshrq_n_u8(mask, 7); // true -> 1, false -> 0 +} + +template <> inline uint32x4x4_t vcast(const bool8x16_t &v) +{ + const uint8x16_t vu8 = vreinterpretq_u8_b8(v); + const uint8x16_t zero_uint8x16 = vdupq_n_u8(0); + uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16); + uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0 + + const uint32x4x4_t ret = {{ + vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb)))), + vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb)))), + vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb)))), + vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb)))), + }}; + + return ret; +} + +template <> inline int32x4x4_t vcast(const bool8x16_t &v) +{ + const uint8x16_t vu8 = vreinterpretq_u8_b8(v); + const uint8x16_t zero_uint8x16 = vdupq_n_u8(0); + uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16); + uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0 + + const int32x4x4_t ret = {{ + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb))))), + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb))))), + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb))))), + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb))))), + }}; + + return ret; +} + +template <> inline float32x4x4_t vcast(const bool8x16_t &v) +{ + const uint8x16_t vu8 = vreinterpretq_u8_b8(v); + const uint8x16_t zero_uint8x16 = vdupq_n_u8(0); + uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16); + uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0 + + const float32x4x4_t ret = {{ + 
vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb))))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb))))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb))))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb))))), + }}; + + return ret; +} + +template <> inline uint32x4x4_t vcast(const uint8x16_t &v) +{ + const uint32x4x4_t ret = {{ + vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v)))), + vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v)))), + vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v)))), + vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v)))), + }}; + + return ret; +} + +template <> inline int32x4x4_t vcast(const uint8x16_t &v) +{ + const int32x4x4_t ret = {{ + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v))))), + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v))))), + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v))))), + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v))))), + }}; + + return ret; +} + +template <> inline float32x4x4_t vcast(const uint8x16_t &v) +{ + const float32x4x4_t ret = {{ + vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v))))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v))))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v))))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v))))), + }}; + + return ret; +} + +template <> inline uint8x16_t vcast(const int32x4x4_t &v) +{ + // Saturate cast + return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovun_s32(v.val[0]), vqmovun_s32(v.val[1]))), + vqmovn_u16(vcombine_u16(vqmovun_s32(v.val[2]), vqmovun_s32(v.val[3])))); +} + +template <> inline uint32x4x4_t vcast(const int32x4x4_t &v) +{ + // Saturate cast + const uint32x4x4_t ret = {{ + vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[0]))), + vqmovun_s64(vmovl_s32(vget_high_s32(v.val[0])))), + 
vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[1]))), + vqmovun_s64(vmovl_s32(vget_high_s32(v.val[1])))), + vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[2]))), + vqmovun_s64(vmovl_s32(vget_high_s32(v.val[2])))), + vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[3]))), + vqmovun_s64(vmovl_s32(vget_high_s32(v.val[3])))), + }}; + + return ret; +} + +template <> inline float32x4x4_t vcast(const int32x4x4_t &v) +{ + const float32x4x4_t ret = {{ + vcvtq_f32_s32(v.val[0]), vcvtq_f32_s32(v.val[1]), vcvtq_f32_s32(v.val[2]), + vcvtq_f32_s32(v.val[3]), + }}; + + return ret; +} + +template <> inline uint8x16_t vcast(const uint32x4x4_t &v) +{ + return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovn_u32(v.val[0]), vqmovn_u32(v.val[1]))), + vqmovn_u16(vcombine_u16(vqmovn_u32(v.val[2]), vqmovn_u32(v.val[3])))); +} + +template <> inline int32x4x4_t vcast(const uint32x4x4_t &v) +{ + const int32x4x4_t ret = {{ + vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[0])))), + vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[0]))))), + vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[1])))), + vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[1]))))), + vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[2])))), + vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[2]))))), + vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[3])))), + vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[3]))))), + }}; + + return ret; +} + +template <> inline float32x4x4_t vcast(const uint32x4x4_t &v) +{ + const float32x4x4_t ret = {{ + vcvtq_f32_u32(v.val[0]), vcvtq_f32_u32(v.val[1]), vcvtq_f32_u32(v.val[2]), + vcvtq_f32_u32(v.val[3]), + }}; + + return ret; +} + +template <> inline uint8x16_t vcast(const float32x4x4_t &v) +{ + // Saturate cast + return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(v.val[0])), + 
vqmovun_s32(vcvtq_s32_f32(v.val[1])))), + vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(v.val[2])), + vqmovun_s32(vcvtq_s32_f32(v.val[3]))))); +} + +template <> inline uint32x4x4_t vcast(const float32x4x4_t &v) +{ + const uint32x4x4_t ret = {{ + vcvtq_u32_f32(v.val[0]), vcvtq_u32_f32(v.val[1]), vcvtq_u32_f32(v.val[2]), + vcvtq_u32_f32(v.val[3]), + }}; + + return ret; +} + +template <> inline int32x4x4_t vcast(const float32x4x4_t &v) +{ + const int32x4x4_t ret = {{ + vcvtq_s32_f32(v.val[0]), vcvtq_s32_f32(v.val[1]), vcvtq_s32_f32(v.val[2]), + vcvtq_s32_f32(v.val[3]), + }}; + + return ret; +} + +template <typename T> struct cast_vector; +template <> struct cast_vector<bool> +{ + using type = bool8x16_t; +}; +template <> struct cast_vector<uint8_t> +{ + using type = uint8x16_t; +}; +template <> struct cast_vector<uint32_t> +{ + using type = uint32x4x4_t; +}; +template <> struct cast_vector<int32_t> +{ + using type = int32x4x4_t; +}; +template <> struct cast_vector<float> +{ + using type = float32x4x4_t; +}; + +template <typename T> inline void store_result(T *ptr, const typename cast_vector<T>::type &v) +{ + wrapper::vstore(ptr, v.val[0]); + wrapper::vstore(ptr + 4, v.val[1]); + wrapper::vstore(ptr + 8, v.val[2]); + wrapper::vstore(ptr + 12, v.val[3]); +} + +template <> inline void store_result<uint8_t>(uint8_t *ptr, const uint8x16_t &v) +{ + wrapper::vstore(ptr, v); +} + +inline bool8x16_t vloadq(const bool *ptr) +{ + bool8x16_t ret; + ret.val = wrapper::vloadq(reinterpret_cast<const uint8_t *>(ptr)); + return ret; +} + +template <typename T> inline typename cast_vector<T>::type load_input(const T *ptr) +{ + return wrapper::vloadq(ptr); +} + +template <> inline typename cast_vector<bool>::type load_input(const bool *ptr) +{ + return vloadq(ptr); +} + +template <> inline typename cast_vector<uint32_t>::type load_input(const uint32_t *ptr) +{ + return vld4q_u32(ptr); +} + +template <> inline typename cast_vector<int32_t>::type load_input(const int32_t *ptr) +{ + 
return vld4q_s32(ptr); +} + +template <> inline typename cast_vector<float>::type load_input(const float *ptr) +{ + return vld4q_f32(ptr); +} + +template <typename T> inline T get_value(const T *ptr) { return *ptr; } + +template <> inline bool get_value(const bool *ptr) +{ + bool ret = (*ptr != 0); + return ret; +} + +template <typename FromT> void run_cast(const ITensor *input, ITensor *output, const Window &window) +{ + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + // Collapse window and reset first dimension to handle tail calculations manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Create iterators + Iterator in(input, win_collapsed); + Iterator out(output, win_collapsed); + +#ifdef __aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; +#else //__aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO; +#endif //__aarch64__ + + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { + const auto in_ptr = reinterpret_cast<const FromT *>(in.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + using from_vector = typename cast_vector<FromT>::type; + const from_vector vin = load_input(in_ptr + x); + + switch (output->info()->data_type()) + { + case DataType::U8: + { + using to_vector = typename cast_vector<uint8_t>::type; + const to_vector vout = vcast<to_vector, from_vector>(vin); + store_result<uint8_t>(reinterpret_cast<uint8_t *>(out.ptr()) + x, vout); + break; + } + case DataType::QASYMM8: + { + using to_vector = typename cast_vector<float>::type; + const QuantizationInfo &qinfo_out = output->info()->quantization_info(); + const auto vf = vcast<to_vector, from_vector>(vin); + const auto vout = vquantize(vf, 
qinfo_out); + store_result<qasymm8_t>(reinterpret_cast<qasymm8_t *>(out.ptr()) + x, vout); + break; + } + case DataType::U32: + { + using to_vector = typename cast_vector<uint32_t>::type; + const to_vector vout = vcast<to_vector, from_vector>(vin); + store_result<uint32_t>(reinterpret_cast<uint32_t *>(out.ptr()) + x, vout); + break; + } + case DataType::S32: + { + using to_vector = typename cast_vector<int32_t>::type; + const to_vector vout = vcast<to_vector, from_vector>(vin); + store_result<int32_t>(reinterpret_cast<int32_t *>(out.ptr()) + x, vout); + break; + } + case DataType::F32: + { + using to_vector = typename cast_vector<float>::type; + const to_vector vout = vcast<to_vector, from_vector>(vin); + store_result<float>(reinterpret_cast<float *>(out.ptr()) + x, vout); + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + FromT val = get_value(in_ptr + x); + switch (output->info()->data_type()) + { + case DataType::U8: + { + *(reinterpret_cast<uint8_t *>(out.ptr()) + x) = static_cast<uint8_t>(val); + break; + } + case DataType::QASYMM8: + { + const QuantizationInfo &qinfo_out = output->info()->quantization_info(); + const auto qval = qinfo_out.quantize(static_cast<float>(val), rounding_policy); + *(reinterpret_cast<qasymm8_t *>(out.ptr()) + x) = qval; + break; + } + case DataType::U32: + { + *(reinterpret_cast<uint32_t *>(out.ptr()) + x) = static_cast<uint32_t>(val); + break; + } + case DataType::S32: + { + *(reinterpret_cast<int32_t *>(out.ptr()) + x) = static_cast<int32_t>(val); + break; + } + case DataType::F32: + { + *(reinterpret_cast<float *>(out.ptr()) + x) = static_cast<float>(val); + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } + } + }, + in, out); +} + +void run_cast_qasymm8(const ITensor *input, ITensor *output, const Window &window) +{ + const int window_step_x = 16; + const auto window_start_x = 
static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + // Collapse window and reset first dimension to handle tail calculations manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Create iterators + Iterator in(input, win_collapsed); + Iterator out(output, win_collapsed); + +#ifdef __aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; +#else //__aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO; +#endif //__aarch64__ + const auto &qinfo_in = input->info()->quantization_info(); + const auto &qinfo_out = output->info()->quantization_info(); + + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { + const auto in_ptr = reinterpret_cast<const qasymm8_t *>(in.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + using from_vector = typename cast_vector<float>::type; + const auto vf = wrapper::vloadq(in_ptr + x); + const auto vin = vdequantize(vf, qinfo_in); + switch (output->info()->data_type()) + { + case DataType::U8: + { + using to_vector = typename cast_vector<uint8_t>::type; + const to_vector vout = vcast<to_vector, from_vector>(vin); + store_result<uint8_t>(reinterpret_cast<uint8_t *>(out.ptr()) + x, vout); + break; + } + case DataType::QASYMM8: + { + using to_vector = typename cast_vector<float>::type; + const auto vf = vcast<to_vector, from_vector>(vin); + const auto vout = vquantize(vf, qinfo_out); + store_result<qasymm8_t>(reinterpret_cast<qasymm8_t *>(out.ptr()) + x, vout); + break; + } + case DataType::U32: + { + using to_vector = typename cast_vector<uint32_t>::type; + const to_vector vout = vcast<to_vector, from_vector>(vin); + store_result<uint32_t>(reinterpret_cast<uint32_t *>(out.ptr()) + x, vout); + break; + } + case DataType::S32: + { + using to_vector = typename 
cast_vector<int32_t>::type; + const to_vector vout = vcast<to_vector, from_vector>(vin); + store_result<int32_t>(reinterpret_cast<int32_t *>(out.ptr()) + x, vout); + break; + } + case DataType::F32: + { + using to_vector = typename cast_vector<float>::type; + const to_vector vout = vcast<to_vector, from_vector>(vin); + store_result<float>(reinterpret_cast<float *>(out.ptr()) + x, vout); + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + qasymm8_t qval_in = *(in_ptr + x); + const auto val = qinfo_in.dequantize(qval_in); + + switch (output->info()->data_type()) + { + case DataType::U8: + { + *(reinterpret_cast<uint8_t *>(out.ptr()) + x) = static_cast<uint8_t>(val); + break; + } + case DataType::QASYMM8: + { + const auto qval_out = qinfo_out.quantize(val, rounding_policy); + *(reinterpret_cast<qasymm8_t *>(out.ptr()) + x) = qval_out; + break; + } + case DataType::U32: + { + *(reinterpret_cast<uint32_t *>(out.ptr()) + x) = static_cast<uint32_t>(val); + break; + } + case DataType::S32: + { + *(reinterpret_cast<int32_t *>(out.ptr()) + x) = static_cast<int32_t>(val); + break; + } + case DataType::F32: + { + *(reinterpret_cast<float *>(out.ptr()) + x) = static_cast<float>(val); + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } + } + }, + in, out); +} +} // namespace + +NECastKernel::NECastKernel() : _input(nullptr), _output(nullptr), _input_subtype(SubDataType::NONE) +{ +} + +void NECastKernel::configure(const ITensor *input, ITensor *output, SubDataType input_subtype) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), input_subtype)); + + _input = input; + _output = output; + _input_subtype = input_subtype; + + // Configure kernel window + auto win_config = validate_and_configure_window(input->info(), output->info()); + + 
ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); + + INEKernel::configure(std::get<1>(win_config)); +} + +Status NECastKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + SubDataType input_subtype) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, input_subtype)); + ARM_COMPUTE_RETURN_ON_ERROR( + std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); + return Status{}; +} + +void NECastKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + switch (_input->info()->data_type()) + { + case DataType::U8: + if (_input_subtype == SubDataType::BOOL) + { + run_cast<bool>(_input, _output, window); + } + else + { + run_cast<uint8_t>(_input, _output, window); + } + break; + case DataType::QASYMM8: + run_cast_qasymm8(_input, _output, window); + break; + case DataType::U32: + run_cast<uint32_t>(_input, _output, window); + break; + case DataType::S32: + run_cast<int32_t>(_input, _output, window); + break; + case DataType::F32: + run_cast<float>(_input, _output, window); + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp new file mode 100644 index 000000000..8a2223c26 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" +#include <arm_neon.h> +#include <cstdint> + +using namespace arm_compute::misc::shape_calculator; + +namespace arm_compute +{ +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); + ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 2); + + const DataLayout data_layout = input->data_layout(); + const int idx_channel = + get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] % (block_shape * block_shape) != + 0); + // Validate output if initialized + if (output->total_size() != 0) + { + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = + get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != + (block_shape * input->tensor_shape()[idx_width])); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != + (block_shape * input->tensor_shape()[idx_height])); + ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + } + + return Status{}; +} +} // namespace + +NEDepthToSpaceLayerKernelEx::NEDepthToSpaceLayerKernelEx() + : _input(nullptr), _output(nullptr), _block_shape() +{ +} + +void NEDepthToSpaceLayerKernelEx::configure(const ITensor *input, ITensor *output, + int32_t block_shape) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + 
TensorShape output_shape = compute_depth_to_space_shape_ex(input->info(), block_shape); + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape)); + + _input = input; + _output = output; + _block_shape = block_shape; + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps()); + ICPPKernel::configure(win); +} + +Status NEDepthToSpaceLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output, + int32_t block_shape) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, block_shape)); + return Status{}; +} + +void NEDepthToSpaceLayerKernelEx::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); + + const int idx_channel = + get_data_layout_dimension_index(_input->info()->data_layout(), DataLayoutDimension::CHANNEL); + const int depth_size = _input->info()->dimension(idx_channel); + const int r = (depth_size / (_block_shape * _block_shape)); + const int element_size = _input->info()->element_size(); + + Window slice_out = window.first_slice_window_3D(); + + // The slice_out slice does not move + slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + // Main loop for NCHW and NHWC + if (_input->info()->data_layout() == DataLayout::NCHW) + { + Window slice_in = window.first_slice_window_2D(); + do + { + Iterator in(_input, slice_in); + execute_window_loop(slice_in, + [&](const Coordinates &id) { + const int x = id.x(); + const int y = id.y(); + + const int z = id.z() % r; + 
const int out_x = x * _block_shape + (id.z() / r) % _block_shape; + const int out_y = y * _block_shape + (id.z() / r) / _block_shape; + Coordinates output_coords{out_x, out_y, z, id[3]}; + memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); + }, + in); + } while (window.slide_window_slice_2D(slice_in)); + } + else + { + Window slice_in = window.first_slice_window_3D(); + do + { + Iterator in(_input, slice_in); + execute_window_loop(slice_in, + [&](const Coordinates &id) { + const int x = id.y(); + const int y = id.z(); + + const int z = id.x() % r; + const int out_x = x * _block_shape + (id.x() / r) % _block_shape; + const int out_y = y * _block_shape + (id.x() / r) / _block_shape; + Coordinates output_coords{z, out_x, out_y, id[3]}; + memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); + }, + in); + } while (window.slide_window_slice_3D(slice_in)); + } +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp new file mode 100644 index 000000000..cebd614df --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h" + +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/NEAsymm.h" +#include "arm_compute/core/NEON/NEFixedPoint.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" + +#include <algorithm> +#include <arm_neon.h> +#include <cstdint> +#include <map> +#include <string> + +namespace arm_compute +{ +class Coordinates; + +namespace +{ +template <ElementWiseUnaryEx op, typename ScalarType> +inline ScalarType elementwise_op_scalar(const ScalarType &a) +{ + switch (op) + { + case ElementWiseUnaryEx::NEG: + return -a; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } +} + +template <ElementWiseUnaryEx op, typename VectorType> +inline VectorType elementwise_op(const VectorType &a) +{ + switch (op) + { + case ElementWiseUnaryEx::NEG: + return wrapper::vneg(a); + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } +} + +template <ElementWiseUnaryEx op, typename ScalarType> +void elementwise_op(const ITensor *in, ITensor *out, const Window &window) +{ + const int window_step_x = 16 / sizeof(ScalarType); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(in, win); + Iterator output(out, win); + + execute_window_loop(win, + [&](const Coordinates &) { + auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr()); + + int x = window_start_x; + for (; x <= window_end_x - window_step_x; x += window_step_x) + { + wrapper::vstore(output_ptr + x, + elementwise_op<op>(wrapper::vloadq(input_ptr + 
x))); + } + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = elementwise_op_scalar<op>(*(input_ptr + x)); + } + }, + input, output); +} + +template <ElementWiseUnaryEx op> +std::function<void(const ITensor *input, ITensor *output, const Window &window)> +configure_func(const ITensor *input, ITensor *output) +{ + std::string function_to_call("op_"); + function_to_call += string_from_data_type(input->info()->data_type()) + "_"; + function_to_call += string_from_data_type(output->info()->data_type()); + + static std::map<std::string, NEElementwiseUnaryKernelEx::ElementwiseUnaryFunction *> + map_function = { + {"op_F32_F32", &elementwise_op<op, float>}, {"op_S32_S32", &elementwise_op<op, int32_t>}, + }; +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + map_function["op_F16_F16"] = &elementwise_op<op, float16_t>; +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + + auto it = map_function.find(function_to_call); + + if (it != map_function.end()) + { + auto func = it->second; + return [func](const ITensor *input, ITensor *output, const Window &window) { + func(input, output, window); + }; + } + return nullptr; +} +} // namespace + +NEElementwiseUnaryKernelEx::NEElementwiseUnaryKernelEx() + : _function(nullptr), _input(nullptr), _output(nullptr) +{ +} + +void NEElementwiseUnaryKernelEx::configure(ElementWiseUnaryEx op, const ITensor *input, + ITensor *output) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input->info(), *output->info())); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + // Configure kernel window + const std::pair<TensorShape, ValidRegion> broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(*input->info()); + const TensorShape &out_shape = broadcast_pair.first; + const ValidRegion &valid_region = broadcast_pair.second; + + // Auto initialize output if not initialized + auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type()); + + Window win = calculate_max_window(valid_region); + + _input = input; + _output = 
output; + + INEKernel::configure(win); + + switch (op) + { + case ElementWiseUnaryEx::NEG: + _function = configure_func<ElementWiseUnaryEx::NEG>(input, output); + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } +} + +Status NEElementwiseUnaryKernelEx::validate_arguments(const ITensorInfo &input, + const ITensorInfo &output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::F16, DataType::F32, + DataType::S32); + + // Validate in case of configured output + if (output.total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output); + } + + return Status{}; +} + +Status NEElementwiseUnaryKernelEx::validate(ElementWiseUnaryEx op, const ITensorInfo *input, + const ITensorInfo *output) +{ + ARM_COMPUTE_UNUSED(op); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *output)); + return Status{}; +} + +void NEElementwiseUnaryKernelEx::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_function == nullptr); + _function(_input, _output, window); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp new file mode 100644 index 000000000..5401afea0 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +NEEmbeddingLookupKernel::NEEmbeddingLookupKernel() + : _input(nullptr), _lookups(nullptr), _output(nullptr) +{ +} + +void NEEmbeddingLookupKernel::configure(const ITensor *input, ITensor *output, + const ITensor *lookups) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info())); + + _input = input; + _output = output; + _lookups = lookups; + + // Auto initialize output if not initialized + auto out_shape = input->info()->tensor_shape(); + out_shape.set(out_shape.num_dimensions() - 1, lookups->info()->num_dimensions()); + auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); + + INEKernel::configure(calculate_max_window(*output->info())); +} + +Status NEEmbeddingLookupKernel::validate(const arm_compute::ITensorInfo *input, + const arm_compute::ITensorInfo *output, + const arm_compute::ITensorInfo *lookups) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); + + ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1); + + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON(input->num_dimensions() != 
output->num_dimensions()); + ARM_COMPUTE_ERROR_ON(output->dimension(output->num_dimensions() - 1) != lookups->dimension(0)); + for (size_t i = 0; i < output->num_dimensions() - 1; ++i) + { + ARM_COMPUTE_ERROR_ON(input->dimension(i) != output->dimension(i)); + } + } + + return Status{}; +} + +void NEEmbeddingLookupKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + const size_t lookup_dim = _output->info()->num_dimensions() - 1; + + Window output_window{window}; + output_window.set(Window::DimX, + Window::Dimension(output_window.x().start(), output_window.x().end(), + _input->info()->dimension(0))); + + Window out_slice = output_window.first_slice_window_4D(); + do + { + Iterator output_it(_output, out_slice); + + execute_window_loop(out_slice, + [&](const Coordinates &id) { + const int32_t lookup = *reinterpret_cast<int32_t *>( + _lookups->ptr_to_element(Coordinates{id[lookup_dim]})); + Coordinates input_id{id}; + input_id.set(lookup_dim, lookup); + memcpy(output_it.ptr(), _input->ptr_to_element(input_id), + _output->info()->dimension(0) * _output->info()->element_size()); + }, + output_it); + + } while (window.slide_window_slice_4D(out_slice)); +} diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp new file mode 100644 index 000000000..ce2413dc1 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h" + +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" + +namespace arm_compute +{ +namespace +{ +/** Validate the indices + * + * Validate that indices are not negative + * + * @param[in] indices Indices tensor info. 
+ */ +template <typename U> void validate_indices(const ITensor *indices) +{ + for (size_t i = 0; i < indices->info()->tensor_shape()[0]; ++i) + { + ARM_COMPUTE_ERROR_ON(*(reinterpret_cast<U *>(indices->ptr_to_element(Coordinates(i)))) < 0); + } +} + +} // namespace + +NEGatherKernelEx::NEGatherKernelEx() : _input{}, _indices{}, _axis{}, _output{}, _func{} {} + +template <typename U> +inline void NEGatherKernelEx::gather_0_axis(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + + // Validate that the indices are not negative + validate_indices<U>(_indices); + + Iterator output_it(_output, window); + execute_window_loop( + window, + [&](const Coordinates &id) { + Coordinates gather_id(id); + gather_id.collapse(_indices->info()->num_dimensions(), 0); + + U new_index; + switch (_indices->info()->num_dimensions()) + { + case 1: + new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0])))); + break; + case 2: + new_index = + *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1])))); + break; + case 3: + new_index = *( + reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1], id[2])))); + break; + default: + ARM_COMPUTE_ERROR("Wrong num of dimensions"); + break; + } + + gather_id.set(0, new_index); + + std::copy_n(_input->ptr_to_element(gather_id), _output->info()->element_size(), + output_it.ptr()); + }, + output_it); +} + +template <typename U> +void NEGatherKernelEx::gather_n_axis(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + + // Validate that the indices are not negative + validate_indices<U>(_indices); + + Window output_window{window}; + output_window.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator output_it(_output, output_window); + execute_window_loop( + output_window, + [&](const Coordinates &id) { + Coordinates gather_id(id); + gather_id.collapse(_indices->info()->num_dimensions(), _axis); + + U new_index; + switch 
(_indices->info()->num_dimensions()) + { + case 1: + new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[_axis])))); + break; + case 2: + new_index = *(reinterpret_cast<U *>( + _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1])))); + break; + case 3: + new_index = *(reinterpret_cast<U *>( + _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1], id[_axis + 2])))); + break; + default: + ARM_COMPUTE_ERROR("Wrong num of dimensions"); + break; + } + + gather_id.set(_axis, new_index); + + std::copy_n(_input->ptr_to_element(gather_id), + _input->info()->dimension(0) * _output->info()->element_size(), + output_it.ptr()); + }, + output_it); +} + +void NEGatherKernelEx::configure(const ITensor *input, const ITensor *indices, ITensor *output, + int axis) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices); + ARM_COMPUTE_ERROR_ON(indices->info()->num_dimensions() > 3); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + + _input = input; + _indices = indices; + _output = output; + _axis = axis; + + if (_axis < 0) + { + _axis += input->info()->num_dimensions(); + } + ARM_COMPUTE_ERROR_ON(0 > _axis || _axis >= static_cast<int32_t>(input->info()->num_dimensions())); + + if (0 == _axis) + { + switch (_indices->info()->data_type()) + { + case DataType::U32: + _func = &NEGatherKernelEx::gather_0_axis<uint32_t>; + break; + case DataType::S32: + _func = &NEGatherKernelEx::gather_0_axis<int32_t>; + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + } + else + { + switch (_indices->info()->data_type()) + { + case DataType::U32: + _func = &NEGatherKernelEx::gather_n_axis<uint32_t>; + break; + case DataType::S32: + _func = &NEGatherKernelEx::gather_n_axis<int32_t>; + break; + 
default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + } + // Output auto initialization if not yet initialized + TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex( + input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis); + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type()); + + // Create window + Window win = calculate_max_window(*output->info(), Steps()); + output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); + + INEKernel::configure(win); +} + +Status NEGatherKernelEx::validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output); + ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 3); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > 4); + + if (axis < 0) + { + axis += input->num_dimensions(); + } + + ARM_COMPUTE_RETURN_ERROR_ON(0 > axis || axis >= static_cast<int32_t>(input->num_dimensions())); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); + TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex( + input->tensor_shape(), indices->tensor_shape(), axis); + ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); + } + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32); + + return Status{}; +} + +void 
NEGatherKernelEx::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + (this->*_func)(window, info); +} + +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp new file mode 100644 index 000000000..391337bfb --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <unordered_map> + +using namespace arm_compute; + +namespace +{ +constexpr size_t NOT_HIT = 0xFFFFFFFF; +} // namespace + +NEHashtableLookupKernel::NEHashtableLookupKernel() + : _lookups(nullptr), _keys(nullptr), _input(nullptr), _output(nullptr), _hits{nullptr} +{ +} + +void NEHashtableLookupKernel::configure(const ITensor *lookups, const ITensor *keys, + const ITensor *input, ITensor *output, ITensor *hits) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits); + ARM_COMPUTE_ERROR_THROW_ON( + validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info())); + + _lookups = lookups; + _keys = keys; + _input = input; + _output = output; + _hits = hits; + + // Auto initialize output if not initialized + auto out_shape{input->info()->tensor_shape()}; + out_shape.set(out_shape.num_dimensions() - 1, lookups->info()->num_dimensions(), false); + auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); + + // Auto initialize hits if not initialized + auto_init_if_empty(*hits->info(), lookups->info()->tensor_shape(), 1, DataType::U8); + + INEKernel::configure(calculate_max_window(*output->info())); +} + +Status NEHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITensorInfo *keys, + const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *hits) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + 
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32); + + ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1); + ARM_COMPUTE_ERROR_ON(keys->num_dimensions() > 1); + ARM_COMPUTE_ERROR_ON(keys->dimension(0) != input->dimension(input->num_dimensions() - 1)); + + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON(input->num_dimensions() != output->num_dimensions()); + ARM_COMPUTE_ERROR_ON(output->dimension(output->num_dimensions() - 1) != lookups->dimension(0)); + for (size_t i = 0; i < output->num_dimensions() - 1; ++i) + { + ARM_COMPUTE_ERROR_ON(input->dimension(i) != output->dimension(i)); + } + } + + // Validate in case of configured hits + if (hits->total_size() > 0) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8); + ARM_COMPUTE_ERROR_ON(hits->dimension(0) != output->dimension(output->num_dimensions() - 1)); + ARM_COMPUTE_ERROR_ON(hits->dimension(0) != lookups->dimension(0)); + ARM_COMPUTE_ERROR_ON(hits->num_dimensions() > 1); + } + + return Status{}; +} + +void NEHashtableLookupKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + const size_t lookup_dim = _output->info()->num_dimensions() - 1; + const int const_0 = _output->info()->data_type() == DataType::QASYMM8 + ? 
_output->info()->quantization_info().offset + : 0; + + std::unordered_map<int32_t, size_t> key_index_map; + for (size_t n = 0; n < _keys->info()->dimension(0); ++n) + { + const int32_t key = *reinterpret_cast<int32_t *>(_keys->ptr_to_element({n})); + key_index_map[key] = n; + } + std::vector<size_t> lookup_indices; + for (size_t k = 0; k < _lookups->info()->dimension(0); ++k) + { + const int32_t key = *reinterpret_cast<int32_t *>(_lookups->ptr_to_element({k})); + const auto it = key_index_map.find(key); + if (it == key_index_map.end()) + { + lookup_indices.emplace_back(NOT_HIT); + *_hits->ptr_to_element({k}) = 0; + } + else + { +#if defined(ARM_COMPUTE_DEBUG_ENABLED) + if (it->second >= _keys->info()->dimension(0)) + ARM_COMPUTE_ERROR("HashTable Lookup: Index out of bounds."); +#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) + lookup_indices.emplace_back(it->second); + *_hits->ptr_to_element({k}) = 1; + } + } + + Window output_window{window}; + output_window.set(Window::DimX, + Window::Dimension(output_window.x().start(), output_window.x().end(), + _input->info()->dimension(0))); + + Window out_slice = output_window.first_slice_window_4D(); + do + { + Iterator output_it(_output, out_slice); + + execute_window_loop(out_slice, + [&](const Coordinates &id) { + const auto lookup = lookup_indices.at(id[lookup_dim]); + if (lookup == NOT_HIT) + { + memset(output_it.ptr(), const_0, + _output->info()->dimension(0) * _output->info()->element_size()); + } + else + { + Coordinates input_id{id}; + input_id.set(lookup_dim, lookup); + memcpy(output_it.ptr(), _input->ptr_to_element(input_id), + _output->info()->dimension(0) * _output->info()->element_size()); + } + + }, + output_it); + + } while (window.slide_window_slice_4D(out_slice)); +} diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp new file mode 100644 index 000000000..1ea77fb5c --- /dev/null 
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp @@ -0,0 +1,280 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h" + +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/NEMath.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace +{ +template <typename T> +void instance_normalization_nchw(ITensor *input, ITensor *output, ITensor *gamma, ITensor *beta, + float epsilon, const Window &window) +{ + /** NEON vector tag type. */ + using ExactTagType = + typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; + + // Clear X/Y dimensions on execution window as we handle the planes manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + win.set(Window::DimY, Window::Dimension(0, 1, 1)); + + constexpr int window_step_x = 16 / sizeof(T); + const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1); + const auto channel_idx = + get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL); + + Iterator input_it(input, win); + execute_window_loop( + win, + [&](const Coordinates &id) { + Window win_plane = window; + win_plane.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1)); + win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1)); + + Iterator input_plane_it(input, win_plane); + Iterator output_plane_it(output, win_plane); + + auto sum_h_w = static_cast<T>(0.f); + auto sum_squares_h_w = static_cast<T>(0.f); + + execute_window_loop( + win_plane, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr()); + + auto 
vec_sum_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + + // Compute S elements per iteration + int x = window.x().start(); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + auto vec_input_val = wrapper::vloadq(input_ptr + x); + vec_sum_h_w = wrapper::vadd(vec_sum_h_w, vec_input_val); + vec_sum_squares_h_w = + wrapper::vadd(vec_sum_squares_h_w, wrapper::vmul(vec_input_val, vec_input_val)); + } + + auto vec2_sum_h_w = + wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w)); + auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), + wrapper::vgetlow(vec_sum_squares_h_w)); + for (int i = 0; i < window_step_x / 4; ++i) + { + vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w); + vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w); + } + sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0); + sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0); + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + const auto value = *(input_ptr + x); + sum_h_w += value; + sum_squares_h_w += value * value; + } + }, + input_plane_it, output_plane_it); + + const auto mean_h_w = sum_h_w / elements_plane; + const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w; + + auto gamma_val = 1.0f; + if (gamma != nullptr) + { + gamma_val = *reinterpret_cast<T *>(gamma->ptr_to_element({id[channel_idx]})); + } + const auto multip_h_w = gamma_val / std::sqrt(var_h_w + epsilon); + const auto vec_mean_h_w = wrapper::vdup_n(static_cast<T>(mean_h_w), ExactTagType{}); + const auto vec_multip_h_w = wrapper::vdup_n(static_cast<T>(multip_h_w), ExactTagType{}); + auto beta_val = 0.0f; + if (beta != nullptr) + { + beta_val = *reinterpret_cast<T *>(beta->ptr_to_element({id[channel_idx]})); + } + const auto vec_beta = wrapper::vdup_n(static_cast<T>(beta_val), 
ExactTagType{}); + + execute_window_loop( + win_plane, + [&](const Coordinates &) { + auto input_ptr = reinterpret_cast<T *>(input_plane_it.ptr()); + auto output_ptr = reinterpret_cast<T *>(output_plane_it.ptr()); + + // Compute S elements per iteration + int x = window.x().start(); + auto vec_val = wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{}); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + vec_val = wrapper::vloadq(input_ptr + x); + vec_val = wrapper::vadd( + wrapper::vmul(wrapper::vsub(vec_val, vec_mean_h_w), vec_multip_h_w), vec_beta); + wrapper::vstore(output_ptr + x, vec_val); + } + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + *(output_ptr + x) = ((*(input_ptr + x)) - mean_h_w) * multip_h_w + beta_val; + } + }, + input_plane_it, output_plane_it); + }, + input_it); +} + +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma, const ITensorInfo *beta, float epsilon) +{ + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(epsilon == 0.f, "Epsilon must be different than 0"); + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC, + "NHWC data layout is not supported by the kernel directly"); + + if (output != nullptr && output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), + "Input and output have different number of channels"); + } + + if (gamma != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(get_data_layout_dimension_index( + input->data_layout(), 
DataLayoutDimension::CHANNEL)) != + gamma->dimension(0), + "Gamma's size must be the same as size of input's channel"); + } + + if (beta != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(get_data_layout_dimension_index( + input->data_layout(), DataLayoutDimension::CHANNEL)) != + beta->dimension(0), + "Beta's size must be the same as size of input's channel"); + } + + return Status{}; +} + +std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + // We handle the planes manually + Window win = calculate_max_window(*input, Steps(1)); + + // Output auto initialization if not yet initialized + auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type()); + + // NEInstanceNormalizationLayerKernelEx doesn't need padding so update_window_and_padding() can be + // skipped + Coordinates coord; + coord.set_num_dimensions(output->num_dimensions()); + output->set_valid_region(ValidRegion(coord, output->tensor_shape())); + return std::make_pair(Status{}, win); +} +} // namespace + +NEInstanceNormalizationLayerKernelEx::NEInstanceNormalizationLayerKernelEx() + : _func(nullptr), _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), + _epsilon(1e-12) +{ +} + +void NEInstanceNormalizationLayerKernelEx::configure(ITensor *input, ITensor *output, + ITensor *gamma, ITensor *beta, float epsilon) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + _input = input; + _output = output == nullptr ? 
input : output; + _gamma = gamma; + _beta = beta; + _epsilon = epsilon; + + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(_input->info(), _output->info(), gamma->info(), beta->info(), epsilon)); + + if (_input->info()->data_type() == DataType::F32) + { + _func = &instance_normalization_nchw<float>; + } +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + else if (_input->info()->data_type() == DataType::F16) + { + _func = &instance_normalization_nchw<float16_t>; + } +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + else + { + ARM_COMPUTE_ERROR("Unsupported data type"); + } + + // Configure kernel window + auto win_config = validate_and_configure_window(_input->info(), _output->info()); + ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); + + INEKernel::configure(std::get<1>(win_config)); +} + +Status NEInstanceNormalizationLayerKernelEx::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *gamma, + const ITensorInfo *beta, float epsilon) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, gamma, beta, epsilon)); + ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window( + input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get())))); + return Status{}; +} + +void NEInstanceNormalizationLayerKernelEx::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + (*_func)(_input, _output, _gamma, _beta, _epsilon, window); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp new file mode 100644 index 000000000..de218d489 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. 
All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/NEAsymm.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "arm_compute/core/CPP/Validate.h" + +#include <arm_neon.h> + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *scale_factor, + const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scale_factor, 1, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->dimension(0) != input->dimension(1)); + + // Checks performed when output is configured + if ((output->total_size() != 0)) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + } + + return Status{}; +} + +inline int32x4x4_t load_value(const int32_t *input_ptr) +{ + return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4), + wrapper::vloadq(input_ptr + 8), wrapper::vloadq(input_ptr + 12)}; +} + +inline float32x4x4_t load_value(const float *input_ptr) +{ + return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4), + wrapper::vloadq(input_ptr + 8), 
wrapper::vloadq(input_ptr + 12)}; +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +inline const float32x4x4_t load_value(const float16_t *input_ptr) +{ + return {vcvt_f32_f16(wrapper::vload(input_ptr)), vcvt_f32_f16(wrapper::vload(input_ptr + 4)), + vcvt_f32_f16(wrapper::vload(input_ptr + 8)), + vcvt_f32_f16(wrapper::vload(input_ptr + 12))}; +} + +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +template <typename T> inline void store_result(T *ptr, const float32x4x4_t &v) +{ + ARM_COMPUTE_UNUSED(ptr, v); +} + +template <> inline void store_result<float>(float *ptr, const float32x4x4_t &v) +{ + wrapper::vstore(ptr, v.val[0]); + wrapper::vstore(ptr + 4, v.val[1]); + wrapper::vstore(ptr + 8, v.val[2]); + wrapper::vstore(ptr + 12, v.val[3]); +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +template <> inline void store_result<float16_t>(float16_t *ptr, const float32x4x4_t &v) +{ + wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1]))); + wrapper::vstore(ptr + 8, vcombine_f16(vcvt_f16_f32(v.val[2]), vcvt_f16_f32(v.val[3]))); +} +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + +inline float32x4x4_t multiply_scale_vec(const int32x4x4_t &iv, float scale) +{ + const float32x4_t vscale = vdupq_n_f32(scale); + + const float32x4x4_t ret = {{ + vmulq_f32(vcvtq_f32_s32(iv.val[0]), vscale), vmulq_f32(vcvtq_f32_s32(iv.val[1]), vscale), + vmulq_f32(vcvtq_f32_s32(iv.val[2]), vscale), vmulq_f32(vcvtq_f32_s32(iv.val[3]), vscale), + }}; + return ret; +} +} // namespace + +NEMultiplyScaleFactorKernel::NEMultiplyScaleFactorKernel() + : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f) +{ +} + +void NEMultiplyScaleFactorKernel::configure(const ITensor *input, const ITensor *scale_factor, + ITensor *output, float multiplier) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), scale_factor->info(), output->info())); + + _input = input; + _scale_factor = 
scale_factor; + _output = output; + _multiplier = multiplier; + + // Configure kernel window + Window win_config = calculate_max_window(*input->info(), Steps()); + + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + + INEKernel::configure(win_config); +} + +Status NEMultiplyScaleFactorKernel::validate(const ITensorInfo *input, + const ITensorInfo *scale_factor, + const ITensorInfo *output, float multiplier) +{ + ARM_COMPUTE_UNUSED(multiplier); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, scale_factor, output)); + + return Status{}; +} + +template <typename T> void NEMultiplyScaleFactorKernel::multiply(const Window &window) +{ + constexpr auto window_step = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + // Collapse window and reset first dimension to handle tail calculations manually + // Support Only 2D input + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + Iterator input(_input, win_collapsed); + Iterator output(_output, win_collapsed); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + execute_window_loop( + win_collapsed, + [&](const Coordinates &id) { + auto scale = *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})); + scale *= _multiplier; + + const auto input_ptr = reinterpret_cast<const int32_t *>(input.ptr()); + auto output_ptr = reinterpret_cast<T *>(output.ptr()); + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + store_result<float>(&output_ptr[x], multiply_scale_vec(load_value(&input_ptr[x]), scale)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + output_ptr[x] = input_ptr[x] * scale; + } + }, + input, output); +} + +void NEMultiplyScaleFactorKernel::run(const Window &window, const ThreadInfo &info) +{ + 
ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + switch (_output->info()->data_type()) + { + case DataType::F32: + NEMultiplyScaleFactorKernel::multiply<float>(window); + break; +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + NEMultiplyScaleFactorKernel::multiply<float16_t>(window); + break; +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } +} diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp new file mode 100644 index 000000000..ad1bb9051 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp @@ -0,0 +1,274 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEPReLUKernel.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/NEAsymm.h" +#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Window.h" + +#include <arm_neon.h> + +using namespace arm_compute; +namespace +{ + +/** Conditional element-wise operations */ +enum class ConditionalOperation +{ + PRELU, /**< (x * y) for x < 0, x for x >= 0 */ +}; + +template <ConditionalOperation op, typename ScalarType> +inline ScalarType elementwise_conditional_op_scalar(const ScalarType &a, const ScalarType &b) +{ + auto res = ScalarType(0); + + switch (op) + { + case ConditionalOperation::PRELU: + res = a < 0 ? 
a * b : a; + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + return res; +} + +template <ConditionalOperation op> +inline uint8_t elementwise_conditional_op_quantized_scalar(const float &a, const float &b, + QuantizationInfo qinfo) +{ + return qinfo.quantize(elementwise_conditional_op_scalar<op>(a, b), RoundingPolicy::TO_NEAREST_UP); +} + +template <ConditionalOperation op, typename VectorType> +inline VectorType elementwise_conditional_op(const VectorType &a, const VectorType &b) +{ + VectorType res = {0, 0, 0, 0}; + VectorType const_0 = {0, 0, 0, 0}; + + switch (op) + { + case ConditionalOperation::PRELU: + res = wrapper::vbsl(wrapper::vcgt(a, const_0), a, wrapper::vmul(a, b)); + ; + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + return res; +} + +template <ConditionalOperation op> +inline float32x4x4_t elementwise_conditional_op(const float32x4x4_t &a, const float32x4x4_t &b) +{ + float32x4x4_t out = {{ + elementwise_conditional_op<op>(a.val[0], b.val[0]), + elementwise_conditional_op<op>(a.val[1], b.val[1]), + elementwise_conditional_op<op>(a.val[2], b.val[2]), + elementwise_conditional_op<op>(a.val[3], b.val[3]), + }}; + return out; +} + +template <ConditionalOperation op, typename ScalarType, typename VectorType> +inline VectorType elementwise_conditional_op_broadcast(const VectorType &a, + const ScalarType &broadcast_value, + const bool reorder) +{ + VectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag()); + return elementwise_conditional_op<op>(reorder ? broadcast_vector : a, + reorder ? 
a : broadcast_vector); +} + +template <ConditionalOperation op, typename ScalarType, typename VectorType> +inline int elementwise_conditional_op_loop(int window_start_x, int window_end_x, int window_step_x, + const ScalarType *input1_ptr, + const ScalarType *input2_ptr, ScalarType *output_ptr) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(input1_ptr + x); + const auto b = wrapper::vloadq(input2_ptr + x); + wrapper::vstore(output_ptr + x, elementwise_conditional_op<op>(a, b)); + } + return x; +} + +template <ConditionalOperation op> +inline int elementwise_conditional_op_quantized_loop(int window_start_x, int window_end_x, + int window_step_x, const uint8_t *input1_ptr, + const uint8_t *input2_ptr, uint8_t *output_ptr, + int32x4_t voffset1, int32x4_t voffset2, + float32x4_t vscale1, float32x4_t vscale2, + float32x4_t voffseto, float32x4_t invvscaleo) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Get inputs and compute output + const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1); + const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2); + const float32x4x4_t rf = elementwise_conditional_op<op>(af, bf); + store_quantized(output_ptr + x, rf, voffseto, invvscaleo); + } + return x; +} + +template <ConditionalOperation op, typename ScalarType, typename VectorType> +inline int elementwise_conditional_op_broadcast_loop(int window_start_x, int window_end_x, + int window_step_x, + const ScalarType *non_broadcast_input_ptr, + const ScalarType &broadcast_value, + ScalarType *output_ptr, const bool reorder) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq((non_broadcast_input_ptr + x)); + wrapper::vstore(output_ptr + x, + elementwise_conditional_op_broadcast<op>(a, broadcast_value, reorder)); + } + return x; +} + 
+template <ConditionalOperation op> +inline int elementwise_conditional_op_quantized_broadcast_loop( + int window_start_x, int window_end_x, int window_step_x, const uint8_t *non_broadcast_input_ptr, + float32x4x4_t broadcast_vector, uint8_t *output_ptr, int32x4_t voffset_non_broadcast, + float32x4_t vscale_non_broadcast, float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4x4_t af = + load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); + const float32x4x4_t rf = elementwise_conditional_op<op>(reorder ? broadcast_vector : af, + reorder ? af : broadcast_vector); + store_quantized(output_ptr + x, rf, voffseto, invvscaleo); + } + return x; +} + +template <ConditionalOperation op, typename ScalarType, typename VectorType> +void elementwise_conditional_op(const ITensor *in1, const ITensor *in2, ITensor *out, + const Window &window) +{ + elementwise_op(in1, in2, out, window, &elementwise_conditional_op_scalar<op, ScalarType>, + &elementwise_conditional_op_broadcast_loop<op, ScalarType, VectorType>, + &elementwise_conditional_op_loop<op, ScalarType, VectorType>); +} + +template <ConditionalOperation op> +void elementwise_conditional_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, + const Window &window) +{ + elementwise_op_quantized(in1, in2, out, window, &elementwise_conditional_op_quantized_scalar<op>, + &elementwise_conditional_op_quantized_broadcast_loop<op>, + &elementwise_conditional_op_quantized_loop<op>); +} +} // namespace + +NEPReLUKernel::NEPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {} + +void NEPReLUKernel::configure(const ITensor *input, const ITensor *alpha, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, alpha, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input->info(), *alpha->info(), *output->info())); + + // Configure kernel window 
+ const std::pair<TensorShape, ValidRegion> broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info()); + const TensorShape &out_shape = broadcast_pair.first; + const ValidRegion &valid_region = broadcast_pair.second; + + // Auto initialize output if not initialized + auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type()); + + Window win = calculate_max_window(valid_region); + + _input = input; + _alpha = alpha; + _output = output; + INEKernel::configure(win); +} + +void NEPReLUKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + if (_input->info()->data_type() == DataType::F32) + { + elementwise_conditional_op<ConditionalOperation::PRELU, float, float32x4_t>(_input, _alpha, + _output, window); + } + else if (_input->info()->data_type() == DataType::QASYMM8) + { + elementwise_conditional_op_quantized<ConditionalOperation::PRELU>(_input, _alpha, _output, + window); + } + else + { + ARM_COMPUTE_ERROR("Wrong Type"); + } +} + +Status NEPReLUKernel::validate_arguments(const ITensorInfo &input, const ITensorInfo &alpha, + const ITensorInfo &output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &alpha, &output); + + const TensorShape out_shape = + TensorShape::broadcast_shape(input.tensor_shape(), alpha.tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, + "Inputs are not broadcast compatible"); + + // Checks performed when output is configured + if (output.total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, output.tensor_shape(), 0), + "Wrong shape for output"); + } + + return Status{}; +} + +Status NEPReLUKernel::validate(const ITensorInfo *input, const 
ITensorInfo *alpha, + const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, alpha, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *alpha, *output)); + + return Status{}; +} diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp new file mode 100644 index 000000000..acf0092eb --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/NEAsymm.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "arm_compute/core/CPP/Validate.h" + +#include <arm_neon.h> + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *scale_factor) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S8); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scale_factor, 1, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->dimension(0) != input->dimension(1)); + + return Status{}; +} + +inline float32x4x4_t load_value(const float *input_ptr) +{ + return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4), + wrapper::vloadq(input_ptr + 8), wrapper::vloadq(input_ptr + 12)}; +} +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +inline const float32x4x4_t load_value(const float16_t *input_ptr) +{ + return {vcvt_f32_f16(wrapper::vload(input_ptr)), vcvt_f32_f16(wrapper::vload(input_ptr + 4)), + vcvt_f32_f16(wrapper::vload(input_ptr + 8)), + vcvt_f32_f16(wrapper::vload(input_ptr + 12))}; +} + +#endif // 
__ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +inline float32x4_t round(const float32x4_t &fv) +{ + const float32x4_t point5_f32x4 = vdupq_n_f32(0.5f); + const float32x4_t zero_f32x4 = vdupq_n_f32(0.0f); + // If value < 0, mask = -1, else mask = 0 + int32x4_t mask_less_zero_ui32x4 = reinterpret_cast<int32x4_t>(vcltq_f32(fv, zero_f32x4)); + return vaddq_f32(fv, vaddq_f32(vcvtq_f32_s32(mask_less_zero_ui32x4), point5_f32x4)); +} + +inline int8x16_t vquantizeSymm(const float32x4x4_t &fv, float scale_factor_inv, int32_t max_scale) +{ + const float32x4_t vinvscale = vdupq_n_f32(scale_factor_inv); + const int32x4_t vposend = vdupq_n_s32(max_scale); + const int32x4_t vnagend = vdupq_n_s32(-max_scale); + + const int32x4x4_t rf = {{ +#ifdef __aarch64__ + vminq_s32(vposend, + vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))), + vminq_s32(vposend, + vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))), + vminq_s32(vposend, + vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))), + vminq_s32(vposend, + vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))), +#else //__aarch64__ + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))), +#endif //__aarch64__ + }}; + const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); + const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); + return vcombine_s8(pa, pb); +} +} // namespace + +NEQuantizationSymmetricKernel::NEQuantizationSymmetricKernel() + : _input(nullptr), _output(nullptr), _scale_factor(nullptr) +{ +} + +void 
NEQuantizationSymmetricKernel::configure(const ITensor *input, ITensor *output, + ITensor *scale_factor) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), output->info(), scale_factor->info())); + + _input = input; + _output = output; + _scale_factor = scale_factor; + + // Configure kernel window + Window win_config = calculate_max_window(*input->info(), Steps()); + + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + + INEKernel::configure(win_config); +} + +Status NEQuantizationSymmetricKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *scale_factor) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, scale_factor)); + + return Status{}; +} + +template <typename T> void NEQuantizationSymmetricKernel::quantize(const Window &window) +{ + constexpr auto window_step = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + +#ifdef __aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; +#else //__aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP; +#endif //__aarch64__ + + // Collapse window and reset first dimension to handle tail calculations manually + // Support Only 2D input + Window win_collapsed = window; + Iterator input(_input, win_collapsed); + Iterator output(_output, win_collapsed); + const auto dim_x = _input->info()->dimension(0); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + execute_window_loop( + win_collapsed, + [&](const Coordinates &id) { + const auto start = reinterpret_cast<const T *>(input.ptr()); + const auto min_max = std::minmax_element(start, start + dim_x); + const auto int8_scale = 127; + auto range = std::max(std::abs(*min_max.first), 
std::abs(*min_max.second)); + if (range == 0) + { + *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = 1; + range = 1; + } + else + { + *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = range / int8_scale; + } + const auto scale_factor_inv = int8_scale / range; + + auto input_ptr = reinterpret_cast<const T *>(input.ptr()); + auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + wrapper::vstore(&output_ptr[x], + vquantizeSymm(load_value(&input_ptr[x]), scale_factor_inv, int8_scale)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int quantized = arm_compute::round(input_ptr[x] * scale_factor_inv, rounding_policy); + quantized = std::min(int8_scale, std::max(quantized, -int8_scale)); + output_ptr[x] = static_cast<int8_t>(quantized); + } + }, + input, output); +} + +void NEQuantizationSymmetricKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + switch (_input->info()->data_type()) + { + case DataType::F32: + NEQuantizationSymmetricKernel::quantize<float>(window); + break; +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + NEQuantizationSymmetricKernel::quantize<float16_t>(window); + break; +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } +} diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEReductionOperationKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEReductionOperationKernelEx.cpp new file mode 100644 index 000000000..59e7d9beb --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEReductionOperationKernelEx.cpp @@ -0,0 +1,677 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. 
All Rights Reserved
 * Copyright (c) 2017-2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h"

#include "arm_compute/core/CPP/Validate.h"
#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/INEKernel.h"
#include "arm_compute/core/NEON/NEMath.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"

#include "arm_compute/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>

namespace arm_compute
{
// File-local NEON reduction helpers and functors; only visible to this
// translation unit via the anonymous namespace.
namespace
{
// Helper function to calculate the minimum value of the input vector. All the elements in the
// output vector contain the min value.
float32x2_t calculate_min(float32x4_t in)
{
  // 4 float lanes -> two pairwise-min steps reduce to a single value,
  // broadcast across both lanes of the returned float32x2_t.
  auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
  return wrapper::vpmin(pmin, pmin);
}

// Helper function to calculate the maximum value of the input vector. All the elements in the
// output vector contain the max value.
float32x2_t calculate_max(float32x4_t in)
{
  // Mirror of calculate_min using pairwise max.
  auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
  return wrapper::vpmax(pmax, pmax);
}
// Helper function to calculate the minimum value of the input vector. All the elements in the
// output vector contain the min value.
int32x2_t calculate_min(int32x4_t in)
{
  auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
  return wrapper::vpmin(pmin, pmin);
}

// Helper function to calculate the maximum value of the input vector. All the elements in the
// output vector contain the max value.
int32x2_t calculate_max(int32x4_t in)
{
  auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
  return wrapper::vpmax(pmax, pmax);
}

// Helper function to calculate the minimum value of the input vector. All the elements in the
// output vector contain the min value.
inline uint8x8_t calculate_min(uint8x16_t in)
{
  // 16 u8 lanes: the high/low split leaves 8 candidates, so three further
  // pairwise steps (log2(8)) are needed to broadcast the minimum.
  auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
  pmin = wrapper::vpmin(pmin, pmin);
  pmin = wrapper::vpmin(pmin, pmin);
  return wrapper::vpmin(pmin, pmin);
}
// Helper function to calculate the maximum value of the input vector. All the elements in the
// output vector contain the max value.
inline uint8x8_t calculate_max(uint8x16_t in)
{
  auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
  pmax = wrapper::vpmax(pmax, pmax);
  pmax = wrapper::vpmax(pmax, pmax);
  return wrapper::vpmax(pmax, pmax);
}

#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// Helper function to calculate the minimum value of the input vector. All the elements in the
// output vector contain the min value.
+inline float16x4_t calculate_min(float16x8_t in) +{ + auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in)); + pmin = wrapper::vpmin(pmin, pmin); + return wrapper::vpmin(pmin, pmin); +} +// Helper function to calculate the maximum value of the input vector. All the elements in the +// output vector contain the max value. +inline float16x4_t calculate_max(float16x8_t in) +{ + auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in)); + pmax = wrapper::vpmax(pmax, pmax); + return wrapper::vpmax(pmax, pmax); +} +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +template <class F> class Reducer +{ +public: + static void reduceX(const Window &window, const ITensor *input, ITensor *output, F f, + const ReduceOperation op) + { + // Set out window + Window out_window(window); + out_window.set(Window::DimX, Window::Dimension(0, 0, 0)); + + // Get first input and output slices + Window in_slice = window.first_slice_window_1D(); + Window out_slice = out_window.first_slice_window_1D(); + + do + { + Iterator in(input, in_slice); + Iterator out(output, out_slice); + + f(in, out, in_slice, out_slice, *input->info(), op); + } while (window.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice)); + } + static void reduceY(const Window &window, const ITensor *input, ITensor *output, F f, + const ReduceOperation op) + { + // Set in window + Window in_window(window); + Window out_window(window); + + in_window.set(Window::DimY, Window::Dimension(0, 1, 1)); + out_window.set(Window::DimY, Window::Dimension(0, output->info()->dimension(1), + output->info()->dimension(1))); + + // Get first input and output slices + Window in_slice = in_window.first_slice_window_2D(); + Window out_slice = out_window.first_slice_window_2D(); + + do + { + Iterator in(input, in_slice); + Iterator out(output, out_slice); + + f(in, out, in_slice, out_slice, *input->info(), 1, op); + } while (in_window.slide_window_slice_2D(in_slice) && + 
out_window.slide_window_slice_2D(out_slice)); + } + static void reduceZ(const Window &window, const ITensor *input, ITensor *output, F f, + const ReduceOperation op) + { + // Set in window + Window in_window(window); + Window out_window(window); + + in_window.set(Window::DimZ, Window::Dimension(0, 1, 1)); + out_window.set(Window::DimZ, Window::Dimension(0, output->info()->dimension(2), + output->info()->dimension(2))); + + // Get first input and output slices + Window in_slice = in_window.first_slice_window_3D(); + Window out_slice = out_window.first_slice_window_3D(); + + do + { + Iterator in(input, in_slice); + Iterator out(output, out_slice); + + f(in, out, in_slice, out_slice, *input->info(), 2, op); + } while (in_window.slide_window_slice_3D(in_slice) && + out_window.slide_window_slice_3D(out_slice)); + } + static void reduceW(const Window &window, const ITensor *input, ITensor *output, F f, + const ReduceOperation op) + { + // Set in/out window + Window in_window(window); + Window out_window(window); + + in_window.set(3, Window::Dimension(0, 1, 1)); + out_window.set(3, Window::Dimension(0, 1, 1)); + + // Get first input and output slices + Window in_slice = in_window.first_slice_window_4D(); + Window out_slice = out_window.first_slice_window_4D(); + + do + { + Iterator in(input, in_slice); + Iterator out(output, out_slice); + + f(in, out, in_slice, out_slice, *input->info(), 3, op); + } while (in_window.slide_window_slice_4D(in_slice) && + out_window.slide_window_slice_4D(out_slice)); + } +}; + +template <typename T, int S> struct RedOpX +{ + /** NEON vector tag type. 
*/ + using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type; + + inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, + const TensorInfo &in_info, const ReduceOperation op) + { + ARM_COMPUTE_UNUSED(out_slice); + ARM_COMPUTE_UNUSED(in_info); + auto init_res_value = static_cast<T>(0.f); + switch (op) + { + case ReduceOperation::MIN: + case ReduceOperation::MAX: + { + init_res_value = *reinterpret_cast<T *>(input.ptr()); + break; + } + default: + break; + } + auto vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{}); + + execute_window_loop(in_slice, + [&](const Coordinates &) { + const auto in_ptr = reinterpret_cast<const T *>(input.ptr()); + const auto vec_elements = wrapper::vloadq(in_ptr); + + switch (op) + { + case ReduceOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReduceOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + }, + input); + + switch (op) + { + case ReduceOperation::MIN: + { + *(reinterpret_cast<T *>(output.ptr())) = wrapper::vgetlane(calculate_min(vec_res_value), 0); + break; + } + case ReduceOperation::MAX: + { + *(reinterpret_cast<T *>(output.ptr())) = wrapper::vgetlane(calculate_max(vec_res_value), 0); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + } +}; + +struct RedOpX_qasymm8 +{ + inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, + const TensorInfo &in_info, const ReduceOperation op) + { + ARM_COMPUTE_UNUSED(out_slice); + ARM_COMPUTE_UNUSED(in_info); + + uint8x16_t vec_res_value = {0}; + + if (op == ReduceOperation::MIN || op == ReduceOperation::MAX) + { + vec_res_value = wrapper::vdup_n(*input.ptr(), wrapper::traits::vector_128_tag{}); + } + + execute_window_loop(in_slice, + [&](const Coordinates &) { + const auto vec_elements = 
wrapper::vloadq(input.ptr()); + switch (op) + { + case ReduceOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReduceOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + }, + input); + + switch (op) + { + case ReduceOperation::MIN: + { + *(output.ptr()) = static_cast<uint8_t>(wrapper::vgetlane(calculate_min(vec_res_value), 0)); + break; + } + case ReduceOperation::MAX: + { + *(output.ptr()) = static_cast<uint8_t>(wrapper::vgetlane(calculate_max(vec_res_value), 0)); + break; + } + default: + { + ARM_COMPUTE_ERROR("Not supported"); + } + } + } +}; + +template <typename T, int S> struct RedOpYZW +{ + /** NEON vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type; + using neon_vector = typename wrapper::traits::neon_vector<T, S>::type; + + inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, + const TensorInfo &in_info, int axis, const ReduceOperation op) + { + ARM_COMPUTE_UNUSED(out_slice); + + execute_window_loop( + in_slice, + [&](const Coordinates &) { + neon_vector vec_res_value = {0}; + switch (op) + { + case ReduceOperation::MIN: + case ReduceOperation::MAX: + { + vec_res_value = wrapper::vloadq(reinterpret_cast<T *>(input.ptr())); + break; + } + default: + { + vec_res_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + break; + } + } + + for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + { + T *in_ptr; + switch (axis) + { + case 1: + in_ptr = reinterpret_cast<T *>( + input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, dim))); + break; + case 2: + in_ptr = reinterpret_cast<T *>( + input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, dim))); + break; + case 3: + in_ptr = reinterpret_cast<T *>( + input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, 0, dim))); + break; + 
default: + ARM_COMPUTE_ERROR("Not supported"); + } + const auto vec_elements = wrapper::vloadq(in_ptr); + + switch (op) + { + case ReduceOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReduceOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + } + wrapper::vstore(reinterpret_cast<T *>(output.ptr()), vec_res_value); + }, + input, output); + } +}; + +struct RedOpYZW_qasymm8 +{ + inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, + const TensorInfo &in_info, int axis, const ReduceOperation op) + { + ARM_COMPUTE_UNUSED(out_slice); + + execute_window_loop( + in_slice, + [&](const Coordinates &) { + auto vec_res_value = wrapper::vloadq(input.ptr()); + + for (unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim) + { + uint8_t *in_ptr; + switch (axis) + { + case 1: + in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, index_dim)); + break; + case 2: + in_ptr = + input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, index_dim)); + break; + case 3: + in_ptr = + input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, 0, index_dim)); + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + } + const auto vec_elements = wrapper::vloadq(in_ptr); + + switch (op) + { + case ReduceOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReduceOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + } + wrapper::vstore(reinterpret_cast<uint8_t *>(output.ptr()), vec_res_value); + }, + input, output); + } +}; + +void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsigned int axis, + const ReduceOperation op) +{ + const bool is_complex = (input->info()->num_channels() == 
2); + if (is_complex) + { + ARM_COMPUTE_ERROR("Not supported"); + } + + switch (axis) + { + case 0: + switch (input->info()->data_type()) + { + case DataType::QASYMM8: + return Reducer<RedOpX_qasymm8>::reduceX(window, input, output, RedOpX_qasymm8(), op); +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + return Reducer<RedOpX<float16_t, 8>>::reduceX(window, input, output, + RedOpX<float16_t, 8>(), op); +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F32: + return Reducer<RedOpX<float, 4>>::reduceX(window, input, output, RedOpX<float, 4>(), op); + case DataType::S32: + return Reducer<RedOpX<int32_t, 4>>::reduceX(window, input, output, RedOpX<int32_t, 4>(), + op); + default: + ARM_COMPUTE_ERROR("Not supported"); + } + case 1: + switch (input->info()->data_type()) + { + case DataType::QASYMM8: + return Reducer<RedOpYZW_qasymm8>::reduceY(window, input, output, RedOpYZW_qasymm8(), op); +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + return Reducer<RedOpYZW<float16_t, 8>>::reduceY(window, input, output, + RedOpYZW<float16_t, 8>(), op); +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F32: + return Reducer<RedOpYZW<float, 4>>::reduceY(window, input, output, RedOpYZW<float, 4>(), + op); + case DataType::S32: + return Reducer<RedOpYZW<int32_t, 4>>::reduceY(window, input, output, + RedOpYZW<int32_t, 4>(), op); + default: + ARM_COMPUTE_ERROR("Not supported"); + } + case 2: + switch (input->info()->data_type()) + { + case DataType::QASYMM8: + return Reducer<RedOpYZW_qasymm8>::reduceZ(window, input, output, RedOpYZW_qasymm8(), op); +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + return Reducer<RedOpYZW<float16_t, 8>>::reduceZ(window, input, output, + RedOpYZW<float16_t, 8>(), op); +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F32: + return Reducer<RedOpYZW<float, 4>>::reduceZ(window, input, output, RedOpYZW<float, 4>(), + op); + case DataType::S32: + return 
Reducer<RedOpYZW<int32_t, 4>>::reduceZ(window, input, output, + RedOpYZW<int32_t, 4>(), op); + default: + ARM_COMPUTE_ERROR("Not supported"); + } + case 3: + switch (input->info()->data_type()) + { + case DataType::QASYMM8: + return Reducer<RedOpYZW_qasymm8>::reduceW(window, input, output, RedOpYZW_qasymm8(), op); +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + return Reducer<RedOpYZW<float16_t, 8>>::reduceW(window, input, output, + RedOpYZW<float16_t, 8>(), op); +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F32: + return Reducer<RedOpYZW<float, 4>>::reduceW(window, input, output, RedOpYZW<float, 4>(), + op); + case DataType::S32: + return Reducer<RedOpYZW<int32_t, 4>>::reduceW(window, input, output, + RedOpYZW<int32_t, 4>(), op); + default: + ARM_COMPUTE_ERROR("Not supported"); + } + default: + ARM_COMPUTE_ERROR("Unsupported reduction axis"); + } +} + +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, + ReduceOperation op) +{ + ARM_COMPUTE_UNUSED(op); + + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + + if (input->num_channels() == 1) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32, + DataType::F16, DataType::F32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_MSG("Not support complex"); + } + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, + "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); + + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != output->num_channels()); + + const TensorShape output_shape = + 
arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis); + const TensorInfo tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_reshaped); + } + + return Status{}; +} + +std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, + unsigned int axis, ReduceOperation op) +{ + ARM_COMPUTE_UNUSED(op); + + // Calculate output shape and set if empty + const TensorShape output_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis); + + // Output auto initialization if not yet initialized + DataType output_data_type = input->data_type(); + auto_init_if_empty(*output, input->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); + + unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->data_type()); + + // Configure kernel window + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); + + Status err = (window_changed) + ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + + return std::make_tuple(err, win); +} +} // namespace + +NEReductionOperationKernelEx::NEReductionOperationKernelEx() + : _input(nullptr), _output(nullptr), _reduction_axis(0), _op(ReduceOperation::MAX), + _border_size() +{ +} + +BorderSize NEReductionOperationKernelEx::border_size() const { return _border_size; } + +void NEReductionOperationKernelEx::configure(const ITensor *input, ITensor *output, + unsigned int axis, ReduceOperation op) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op)); + + unsigned int num_elems_processed_per_iteration = + 16 / data_size_from_type(input->info()->data_type()); + + _input = input; + _output = output; + _border_size = + (axis == 0) + ? BorderSize(0, num_elems_processed_per_iteration - + (input->info()->dimension(0) % num_elems_processed_per_iteration), + 0, 0) + : BorderSize(); + _op = op; + _reduction_axis = axis; + + // Configure kernel window + auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis, op); + + ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); + + INEKernel::configure(std::get<1>(win_config)); +} + +Status NEReductionOperationKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output, + unsigned int axis, ReduceOperation op) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op)); + ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>( + validate_and_configure_window(input->clone().get(), output->clone().get(), axis, op))); + + return Status{}; +} + +void NEReductionOperationKernelEx::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + reduce_op(window, _input, _output, _reduction_axis, _op); +} +} // namespace arm_compute diff 
--git a/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp
new file mode 100644
index 000000000..36a2f55a9
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp
@@ -0,0 +1,165 @@
/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright (c) 2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h"

#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/wrapper/wrapper.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
#include <arm_neon.h>
#include <cstdint>

using namespace arm_compute::misc::shape_calculator;

namespace arm_compute
{
namespace
{
// Checks that (input, output, block_shape) form a valid SpaceToDepth
// configuration. Returns an error Status on the first violated constraint;
// output constraints are only checked once the output info is initialized.
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape)
{
  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
  // Kernel only addresses up to 4D tensors (batch, spatial dims, channels).
  ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);

  // A block size below 1 cannot tile the spatial dimensions.
  ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 1);

  // Validate output if initialized
  if (output->total_size() != 0)
  {
    const DataLayout data_layout = input->data_layout();
    const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
    const int idx_height =
        get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
    const int idx_channel =
        get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
    const int idx_batch =
        get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
    // Spatial dims must tile evenly into block_shape x block_shape cells.
    ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_width] % block_shape != 0);
    ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_height] % block_shape != 0);
    // Batch count is unchanged by SpaceToDepth.
    ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] !=
                                output->tensor_shape()[idx_batch]);
    // Output channels are input channels multiplied by block_shape^2.
    ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_channel] % (block_shape * block_shape) !=
                                0);
    // The op is a pure rearrangement: element counts must match.
    ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().total_size() !=
                                output->tensor_shape().total_size());
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
  }

  return Status{};
}
} // namespace

// Members start unset; configure() must be called before run().
NESpaceToDepthLayerKernelEx::NESpaceToDepthLayerKernelEx()
    : _input(nullptr), _output(nullptr),
      _block_shape()
{
}

// Wires the kernel to its tensors and derives the iteration window.
// The output info is auto-initialized from the input if the caller left it
// empty; validation runs after that so the checks see the final shapes.
void NESpaceToDepthLayerKernelEx::configure(const ITensor *input, ITensor *output,
                                            int32_t block_shape)
{
  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);

  TensorShape output_shape = compute_space_to_depth_shape_ex(input->info(), block_shape);
  auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());

  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape));

  _input = input;
  _block_shape = block_shape;
  _output = output;

  // Configure kernel window: iterate the output one element at a time
  // (Steps() with no vectorization step).
  Window win = calculate_max_window(*output->info(), Steps());
  INEKernel::configure(win);
}

// Static argument check; mirrors configure() without touching any state.
Status NESpaceToDepthLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output,
                                             int32_t block_shape)
{
  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, block_shape));
  return Status{};
}

// Gathers each output element from its source input coordinate and copies it
// byte-wise. The window iterates over the OUTPUT tensor; input coordinates
// are derived per element from the block decomposition of the output channel.
void NESpaceToDepthLayerKernelEx::run(const Window &window, const ThreadInfo &info)
{
  ARM_COMPUTE_UNUSED(info);
  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);

  const DataLayout data_layout = _input->info()->data_layout();
  const int channel_idx =
      get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
  const int element_size = _input->info()->element_size();

  const size_t channel_size = _input->info()->dimension(channel_idx);

  Window slice_out = window.first_slice_window_3D();

  // Each 3D slice of the window corresponds to one batch; batch_id tracks it.
  int batch_id = 0;

  // Main loop for NCHW and NHWC
  if (_output->info()->data_layout() == DataLayout::NCHW)
  {
    do
    {
      Iterator out(_output, slice_out);
      execute_window_loop(slice_out,
                          [&](const Coordinates &id) {
                            // Output channel encodes (input channel, block cell):
                            // channel_id / channel_size selects the cell inside the
                            // block (% -> x offset, / -> y offset), channel_id %
                            // channel_size is the original input channel.
                            const size_t channel_id = id.z();
                            const size_t in_x =
                                id.x() * _block_shape + (channel_id / channel_size) % _block_shape;
                            const size_t in_y =
                                id.y() * _block_shape + (channel_id / channel_size) / _block_shape;
                            const int z = channel_id % channel_size;
                            Coordinates input_coords{in_x, in_y, z, batch_id};
                            // NOTE(review): memcpy is used but <cstring> is not
                            // included directly in this file — relies on a
                            // transitive include; confirm.
                            memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
                          },
                          out);
      ++batch_id;
    } while (window.slide_window_slice_3D(slice_out));
  }
  else
  {
    // NHWC: dimension 0 is the channel, dimensions 1/2 are spatial.
    do
    {
      Iterator out(_output, slice_out);
      execute_window_loop(slice_out,
                          [&](const Coordinates &id) {
                            const size_t channel_id = id.x();
                            const size_t in_x =
                                id.y() * _block_shape + (channel_id / channel_size) % _block_shape;
                            const size_t in_y =
                                id.z() * _block_shape + (channel_id / channel_size) / _block_shape;
                            const int z = channel_id % channel_size;
                            Coordinates input_coords{z, in_x, in_y, batch_id};
                            memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
                          },
                          out);
      ++batch_id;
    } while (window.slide_window_slice_3D(slice_out));
  }
}
} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/UtilsEx.cpp b/compute/ARMComputeEx/src/core/UtilsEx.cpp
new file mode 100644
index 000000000..94242b56b
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/UtilsEx.cpp
@@ -0,0 +1,45 @@
/*
 * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright (c) 2016-2018 ARM Limited.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
+ */ +#include "arm_compute/core/UtilsEx.h" +#include "arm_compute/core/Error.h" + +using namespace arm_compute; + +const std::pair<unsigned int, unsigned int> +arm_compute::transposeconv_output_dimensions(unsigned int in_width, unsigned int in_height, + unsigned int kernel_width, unsigned int kernel_height, + const PadStrideInfo &info, unsigned int invalid_right, + unsigned int invalid_bottom) +{ + const unsigned int stride_x = info.stride().first; + const unsigned int stride_y = info.stride().second; + const unsigned int padx = info.pad_left() + info.pad_right(); + const unsigned int pady = info.pad_top() + info.pad_bottom(); + + ARM_COMPUTE_ERROR_ON(in_width < 1 || in_height < 1); + ARM_COMPUTE_ERROR_ON(kernel_width <= padx); + ARM_COMPUTE_ERROR_ON(kernel_height <= pady); + + // Find the transpose conv out dimensions + // transpose conv out: + // tconv_out + pad = 1 + (in - 1) * stride + invalid + // tconv_out = 1 + (in - 1) * stride + invalid - pad + const int w = stride_x * (in_width - 1) + kernel_width - padx + invalid_right; + const int h = stride_y * (in_height - 1) + kernel_height - pady + invalid_bottom; + + return std::make_pair<unsigned int, unsigned int>(w, h); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/CLFunctionsEx.cpp b/compute/ARMComputeEx/src/runtime/CL/CLFunctionsEx.cpp new file mode 100644 index 000000000..158fe0b0c --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/CLFunctionsEx.cpp @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_compute/runtime/CL/CLFunctionsEx.h"
+
+// NOTE This empty file aims to validate "CLFunctionsEx.h".
+// DO NOT REMOVE this file.
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp
new file mode 100644
index 000000000..ae64a6edd
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLArgOperation.h"
+
+#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+
+CLArgOperation::CLArgOperation()
+{
+  // DO NOTHING
+}
+
+// Build one CLArgOperationKernel per reduction axis, chained through
+// intermediate CLTensors: input -> interm[0] -> ... -> interm[n-2] -> output.
+// Each kernel reduces exactly one axis; the last kernel writes to `output`.
+void CLArgOperation::configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis,
+                               ArgOperation op)
+{
+  ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), axis, output->info(), op));
+  _input = input;
+  _output = output;
+  _axis = axis;
+  _arg_op = op;
+  // NOTE The argminmax_axis must have no duplication.
+  _num_of_kernels = axis.size();
+  // One intermediate tensor between every pair of consecutive kernels.
+  const size_t num_of_interm_tensors = _num_of_kernels - 1;
+
+  _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
+  _argop_kernels =
+      arm_compute::support::cpp14::make_unique<CLArgOperationKernel[]>(_num_of_kernels);
+
+  // Shrink the shape one axis at a time; each intermediate keeps the reduced
+  // axis as a dimension of size 1.
+  TensorShape shape{input->info()->tensor_shape()};
+  for (size_t i = 0; i < num_of_interm_tensors; i++)
+  {
+    shape.set(_axis[i], 1);
+    _interm_tensors[i].allocator()->init(
+        TensorInfo(shape, input->info()->num_channels(), input->info()->data_type())
+            .set_data_layout(input->info()->data_layout()));
+    _interm_tensors[i].allocator()->allocate();
+  }
+
+  // Set a vector that is ordered ICLTensors sequentially.
+  std::vector<ICLTensor *> tensors;
+  tensors.emplace_back(input);
+  for (size_t i = 0; i < num_of_interm_tensors; i++)
+  {
+    tensors.emplace_back(_interm_tensors.get() + i);
+  }
+  tensors.emplace_back(output);
+
+  // Apply ArgMinMax on all kernels
+  for (size_t i = 0; i < _num_of_kernels; i++)
+  {
+    _argop_kernels[i].configure(tensors[i], tensors[i + 1], _axis[i], op);
+  }
+}
+
+// Static validation mirroring configure(): builds temporary TensorInfos for
+// the intermediate results and validates each kernel in the chain.
+// NOTE(review): assumes axis is non-empty (axis.size() - 1 would otherwise
+// wrap around) — presumably guaranteed by the caller; confirm.
+Status CLArgOperation::validate(const ITensorInfo *input, const std::vector<uint32_t> &axis,
+                                const ITensorInfo *output, ArgOperation op)
+{
+  const size_t num_of_kernels = axis.size();
+  const size_t num_of_interm_tensors = num_of_kernels - 1;
+
+  // Create temporary tensor infos
+  auto interm_tensors =
+      arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
+
+  // Create intermediate tensor info
+  TensorShape shape{input->tensor_shape()};
+
+  for (size_t i = 0; i < num_of_interm_tensors; i++)
+  {
+    shape.set(axis[i], 1);
+    interm_tensors[i].set_data_type(input->data_type());
+    interm_tensors[i].set_tensor_shape(shape);
+    interm_tensors[i].set_num_channels(input->num_channels());
+  }
+
+  // Set a vector that is ordered ITensorInfo sequentially.
+  std::vector<const ITensorInfo *> tensors;
+  tensors.emplace_back(input);
+  for (size_t i = 0; i < num_of_interm_tensors; i++)
+  {
+    tensors.emplace_back(interm_tensors.get() + i);
+  }
+  tensors.emplace_back(output);
+
+  // Validate argminmax only on all kernels
+  for (size_t i = 0; i < num_of_kernels; i++)
+  {
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLArgOperationKernel::validate(tensors[i], tensors[i + 1], axis[i], op));
+  }
+
+  return Status{};
+}
+
+// Enqueue the configured kernels in chain order on the CL command queue.
+void CLArgOperation::run()
+{
+  for (size_t i = 0; i < _num_of_kernels; ++i)
+  {
+    CLScheduler::get().enqueue(_argop_kernels[i]);
+  }
+}
+
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
new file mode 100644
index 000000000..7c5fe5eda
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#include "arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h" + +#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, + BinaryLogicalOperation op) +{ + auto k = arm_compute::support::cpp14::make_unique<CLBinaryLogicalOpKernel>(); + k->configure(input1, input2, output, op); + _kernel = std::move(k); + + if (output->info()->dimension(0) > 1) + { + ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2; + if (broadcasted_info->info()->dimension(0) == 1) + { + _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + } + } +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp new file mode 100644 index 000000000..742fc6f59 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLCast.h" + +#include "arm_compute/core/CL/kernels/CLCastKernel.h" + +using namespace arm_compute; + +void CLCast::configure(ICLTensor *input, ICLTensor *output, SubDataType input_subtype) +{ + auto k = arm_compute::support::cpp14::make_unique<CLCastKernel>(); + k->configure(input, output, input_subtype); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp new file mode 100644 index 000000000..c2e4ca9ff --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLDepthToSpace.h" + +#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h" + +using namespace arm_compute; + +void CLDepthToSpace::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size) +{ + auto k = arm_compute::support::cpp14::make_unique<CLDepthToSpaceKernel>(); + k->configure(input, output, block_size); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp new file mode 100644 index 000000000..2781784ca --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLEmbeddingLookup.h" + +#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h" + +using namespace arm_compute; + +void CLEmbeddingLookup::configure(const ICLTensor *input, ICLTensor *output, + const ICLTensor *lookups) +{ + auto k = arm_compute::support::cpp14::make_unique<CLEmbeddingLookupKernel>(); + k->configure(input, output, lookups); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp new file mode 100644 index 000000000..c6b166163 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h" + +using namespace arm_compute; + +void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *input, + const arm_compute::ICLTensor *weights, + const arm_compute::ICLTensor *biases, + arm_compute::ICLTensor *output, bool needs_reshape, + const arm_compute::TensorShape &reshape) +{ + _input = input; + _weights = weights; + _biases = biases; + _output = output; + _needs_reshape = needs_reshape; + + if (_needs_reshape) + { + // reshape + auto_init_if_empty(*_cl_buffer.info(), + _input->info()->clone()->set_tensor_shape(reshape).set_data_layout( + _input->info()->data_layout())); + _cl_reshape.configure(_input, &_cl_buffer); + + _cl_fc.configure(&_cl_buffer, _weights, _biases, _output); + + // NOTE _cl_buffer is inaccessible from outside, and thus it is safe to invoke allocate here. + _cl_buffer.allocator()->allocate(); + } + else + { + _cl_fc.configure(_input, _weights, _biases, _output); + } +} + +void CLFullyConnectedReshapingLayer::run(void) +{ + if (_needs_reshape) + _cl_reshape.run(); + + _cl_fc.run(); +} + +void CLFullyConnectedReshapingLayer::prepare(void) { _cl_fc.prepare(); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp new file mode 100644 index 000000000..6cad9bd2e --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLGatherEx.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/kernels/CLGatherExKernel.h" + +using namespace arm_compute; + +void CLGatherEx::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, + int axis) +{ + auto k = arm_compute::support::cpp14::make_unique<CLGatherExKernel>(); + k->configure(input, indices, output, axis); + _kernel = std::move(k); +} + +Status CLGatherEx::validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis) +{ + return CLGatherExKernel::validate(input, indices, output, axis); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp new file mode 100644 index 000000000..7180e9356 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLHashtableLookup.h" + +#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h" + +using namespace arm_compute; + +void CLHashtableLookup::configure(const ICLTensor *lookups, const ICLTensor *keys, + const ICLTensor *input, ICLTensor *output, ICLTensor *hits) +{ + auto k = arm_compute::support::cpp14::make_unique<CLHashtableLookupKernel>(); + k->configure(lookups, keys, input, output, hits); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp new file mode 100644 index 000000000..86ea5a66d --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h" + +#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +CLInstanceNormalizationLayerEx::CLInstanceNormalizationLayerEx() {} + +void CLInstanceNormalizationLayerEx::configure(ICLTensor *input, ICLTensor *output, + ICLTensor *gamma, ICLTensor *beta, float epsilon) +{ + auto k = arm_compute::support::cpp14::make_unique<CLInstanceNormalizationLayerKernelEx>(); + k->configure(input, output, gamma, beta, epsilon); + _kernel = std::move(k); +} + +Status CLInstanceNormalizationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma, const ITensorInfo *beta, + float epsilon) +{ + return CLInstanceNormalizationLayerKernelEx::validate(input, output, gamma, beta, epsilon); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp new file mode 100644 index 000000000..be35ea732 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLNeg.h" + +#include "arm_compute/core/CL/kernels/CLNegKernel.h" + +using namespace arm_compute; + +void CLNeg::configure(ICLTensor *input, ICLTensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<CLNegKernel>(); + k->configure(input, output); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp new file mode 100644 index 000000000..38adedd10 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLPReLU.h" + +#include "arm_compute/core/CL/kernels/CLPReLUKernel.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +void CLPReLU::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<CLPReLUKernel>(); + k->configure(input, alpha, output); + _kernel = std::move(k); + + if (output->info()->dimension(0) > 1) + { + ICLTensor *broadcasted_info = (input->info()->dimension(0) == 1) ? input : alpha; + + if (broadcasted_info->info()->dimension(0) == 1) + { + _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + } + } +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp new file mode 100644 index 000000000..2a34c0664 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
 IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLRNNLayerEx.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+CLRNNLayerEx::CLRNNLayerEx(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(),
+      _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(),
+      _gemm_output(), _add_output(), _is_prepared(false)
+{
+}
+
+// Static validation for the RNN cell:
+//   hidden' = act(W * input + R * hidden + bias)
+// Checks the shape relationships between input, weights, recurrent weights,
+// bias, hidden state and output, then validates each sub-operation against a
+// temporary TensorInfo describing the intermediate [hidden_size x batch] shape.
+Status CLRNNLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights,
+                              const ITensorInfo *recurrent_weights, const ITensorInfo *bias,
+                              const ITensorInfo *hidden_state, const ITensorInfo *output,
+                              const ActivationLayerInfo &info)
+{
+  const int idx_width = 0;
+  const int idx_height = 1;
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state,
+                                      output);
+  ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width));
+  ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) !=
+                              recurrent_weights->dimension(idx_width));
+  // Recurrent weights must be square (hidden_size x hidden_size).
+  ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) !=
+                              recurrent_weights->dimension(1));
+  ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != 1);
+  ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(idx_width) != weights->dimension(idx_height));
+  ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_width) != weights->dimension(idx_height));
+  ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height));
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
+                                                     hidden_state->tensor_shape());
+
+  // Shape of every intermediate result of the cell.
+  auto shape_info =
+      TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1,
+                 input->data_type());
+
+  ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info));
+  ARM_COMPUTE_RETURN_ON_ERROR(
+      CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f));
+  ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(
+      ArithmeticOperation::ADD, &shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
+  ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&shape_info, &shape_info, info));
+
+  return Status{};
+}
+
+// Configure the cell. Intermediate buffers are registered with the memory
+// group before the consumers are configured and allocated only after every
+// consumer has been configured, so the memory manager can reuse them.
+void CLRNNLayerEx::configure(const ICLTensor *input, const ICLTensor *weights,
+                             const ICLTensor *recurrent_weights, const ICLTensor *bias,
+                             ICLTensor *hidden_state, ICLTensor *output, ActivationLayerInfo &info)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
+  ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayerEx::validate(input->info(), weights->info(),
+                                                    recurrent_weights->info(), bias->info(),
+                                                    hidden_state->info(), output->info(), info));
+
+  const int idx_height = 1;
+  TensorShape shape =
+      compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height));
+
+  _is_prepared = false;
+
+  _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+  _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+
+  // Manage intermediate buffers and configure
+  _memory_group.manage(&_fully_connected_out);
+  _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out);
+
+  _memory_group.manage(&_gemm_output);
+  _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f);
+
+  _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+  _memory_group.manage(&_add_output);
+
+  _add_kernel.configure(ArithmeticOperation::ADD, &_fully_connected_out, &_gemm_output,
+                        &_add_output, ConvertPolicy::SATURATE);
+
+  _fully_connected_out.allocator()->allocate();
+  _gemm_output.allocator()->allocate();
+
+  // Activation writes the new hidden state in place of the old one.
+  _activation_kernel.configure(&_add_output, hidden_state, info);
+  _add_output.allocator()->allocate();
+
+  _copy_kernel.configure(hidden_state, output);
+}
+
+// Execute one cell step: FC(input) + GEMM(hidden) -> add -> activation,
+// then copy the updated hidden state to the output tensor.
+void CLRNNLayerEx::run()
+{
+  prepare();
+
+  _memory_group.acquire();
+
+  _fully_connected_kernel.run();
+  _gemm_state_f.run();
+  CLScheduler::get().enqueue(_add_kernel);
+  CLScheduler::get().enqueue(_activation_kernel);
+
+  // copy hidden out to output
+  CLScheduler::get().enqueue(_copy_kernel);
+
+  _memory_group.release();
+}
+
+// One-time weight preparation for the sub-functions; idempotent.
+void CLRNNLayerEx::prepare()
+{
+  if (!_is_prepared)
+  {
+    _fully_connected_kernel.prepare();
+    _gemm_state_f.prepare();
+
+    _is_prepared = true;
+  }
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
new file mode 100644
index 000000000..13a25c901
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLReduceOperation.h"
+
+#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLReduceOperation::CLReduceOperation(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _input(nullptr), _output(nullptr), _axis(),
+      _keep_dims(false), _interm_tensors(), _reduce_kernels(), _reshape()
+{
+}
+
+// Static validation. One kernel per axis; when keep_dims is false, the last
+// kernel writes to an extra intermediate that is then reshaped into `output`
+// (dropping the size-1 dimensions), so there is one more intermediate tensor.
+Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                   const std::set<uint32_t> &axis, bool keep_dims,
+                                   const ReduceOperation &op)
+{
+  const size_t num_of_kernels = axis.size();
+  const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0);
+
+  // Create temporary tensor infos
+  auto interm_tensors =
+      arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
+
+  // Create intermediate tensor info
+  TensorShape shape{input->tensor_shape()};
+
+  // Shrink one axis (to size 1, without removing it) per intermediate.
+  auto it = axis.begin();
+  for (size_t i = 0; i < num_of_interm_tensors; ++i, ++it)
+  {
+    shape.set(*it, 1, false);
+    interm_tensors[i].set_data_type(input->data_type());
+    interm_tensors[i].set_tensor_shape(shape);
+    interm_tensors[i].set_num_channels(input->num_channels());
+    interm_tensors[i].set_data_layout(input->data_layout());
+    interm_tensors[i].set_quantization_info(input->quantization_info());
+  }
+
+  // Set a vector that is ordered ITensorInfo sequentially.
+  std::vector<const ITensorInfo *> tensors;
+  tensors.emplace_back(input);
+  for (size_t i = 0; i < num_of_interm_tensors; ++i)
+  {
+    tensors.emplace_back(interm_tensors.get() + i);
+  }
+  tensors.emplace_back(output);
+
+  // Validate ReduceOperation only on all kernels
+  it = axis.begin();
+  for (size_t i = 0; i < num_of_kernels; ++i, ++it)
+  {
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLReduceOperationKernel::validate(tensors[i], tensors[i + 1], *it, op));
+  }
+
+  if (!keep_dims)
+  {
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLReshapeLayer::validate(&interm_tensors[num_of_interm_tensors - 1], output));
+  }
+
+  return Status{};
+}
+
+// Configure the reduction chain:
+//   input -> kernel[0] -> interm[0] -> ... -> kernel[n-1] -> output
+// With keep_dims the final kernel writes straight to `output`; otherwise the
+// final kernel writes to the last intermediate and a reshape drops the
+// reduced (size-1) dimensions into `output`.
+void CLReduceOperation::configure(ICLTensor *input, ICLTensor *output,
+                                  const std::set<uint32_t> &axis, bool keep_dims,
+                                  ReduceOperation op)
+{
+  ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), axis, keep_dims, op));
+
+  _axis = axis;
+
+  _input = input;
+  _output = output;
+  _keep_dims = keep_dims;
+
+  // NOTE The axis must have no duplication.
+  const size_t num_of_kernels = axis.size();
+  const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0);
+
+  _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
+  _reduce_kernels =
+      arm_compute::support::cpp14::make_unique<CLReduceOperationKernel[]>(num_of_kernels);
+
+  // Set a vector that is ordered ICLTensors sequentially.
+  std::vector<ICLTensor *> tensors;
+  tensors.emplace_back(input);
+  for (size_t i = 0; i < num_of_interm_tensors; ++i)
+  {
+    tensors.emplace_back(_interm_tensors.get() + i);
+  }
+  tensors.emplace_back(output);
+
+  // Apply ReduceOperation on all kernels
+  // Each intermediate is registered with the memory group before its producer
+  // kernel is configured, and allocated once its consumer is configured.
+  TensorShape shape{input->info()->tensor_shape()};
+  auto it = axis.begin();
+  for (size_t i = 0; i < num_of_kernels; ++i, ++it)
+  {
+    shape.set(*it, 1, false);
+    if (!keep_dims || i != (num_of_kernels - 1))
+    {
+      _interm_tensors[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape));
+      _memory_group.manage(&_interm_tensors[i]);
+    }
+    _reduce_kernels[i].configure(tensors[i], tensors[i + 1], *it, op);
+    if (i != 0)
+    {
+      _interm_tensors[i - 1].allocator()->allocate();
+    }
+  }
+
+  // Configure reshape layer if we want to drop the dimensions
+  if (!keep_dims)
+  {
+    _reshape.configure(&_interm_tensors[num_of_interm_tensors - 1], output);
+    _interm_tensors[num_of_interm_tensors - 1].allocator()->allocate();
+  }
+}
+
+// Run the reduction kernels in order, then the optional reshape.
+void CLReduceOperation::run()
+{
+  MemoryGroupResourceScope scope_mg(_memory_group);
+
+  const size_t num_of_kernels = _axis.size();
+  for (size_t i = 0; i < num_of_kernels; ++i)
+  {
+    CLScheduler::get().enqueue(_reduce_kernels[i]);
+  }
+
+  if (!_keep_dims)
+  {
+    _reshape.run();
+  }
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp
new file mode 100644
index 000000000..c03826891
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLSpaceToBatchND.h" + +#include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h" + +using namespace arm_compute; + +void CLSpaceToBatchND::configure(const ICLTensor *input, const ICLTensor *block_size, + const ICLTensor *padding_size, ICLTensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<CLSpaceToBatchNDKernel>(); + k->configure(input, block_size, padding_size, output); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp new file mode 100644 index 000000000..0f455f96f --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLSpaceToDepth.h" + +#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h" + +using namespace arm_compute; + +void CLSpaceToDepth::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size) +{ + auto k = arm_compute::support::cpp14::make_unique<CLSpaceToDepthKernel>(); + k->configure(input, output, block_size); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp new file mode 100644 index 000000000..80d50ad94 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp @@ -0,0 +1,311 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLTopKV2.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "arm_compute/core/CL/ICLTensor.h" + +#include "../../topk_v2.h" + +namespace arm_compute +{ + +CLTopKV2::CLTopKV2() + : _k(0), _total_bits(0), _bits(0), _radix(0), _hist_buf_size(0), _glob_sum_buf_size(0), _n(0), + _input(nullptr), _values(nullptr), _indices(nullptr), _qs_idx_buf(), _qs_temp_buf(), + _hist_buf(), _glob_sum_buf(), _temp_buf(), _first_negative_idx_buf(), _in_key_buf(), + _out_key_buf(), _in_ind_buf(), _out_ind_buf(), _p_in_key_buf(nullptr), + _p_out_key_buf(nullptr), _p_in_ind_buf(nullptr), _p_out_ind_buf(nullptr) /*, _qs_kernel(), + _init_kernel(), _hist_kernel(), _scan_hist_kernel(), _glob_scan_hist_kernel(), + _paste_hist_kernel(), _reorder_kernel(), _find_first_negative_kernel(), + _reorder_negatives_kernel(), _store_kernel()*/ +{ +} + +void CLTopKV2::configure(ICLTensor *input, int k, ICLTensor *values, ICLTensor *indices, + int total_bits, int bits) +{ + _total_bits = total_bits; + _bits = bits; + _n = input->info()->tensor_shape()[0]; + + // _total_bits should be divided by _bits. 
+ ARM_COMPUTE_ERROR_ON((_total_bits % _bits) != 0); + + _k = k; + _radix = 1 << bits; + + _input = input; + _values = values; + _indices = indices; + + std::string topk_env; + +// Disable GPU implementation +// TODO Enable GPU implementation with verification, or remove code +// Invalid result on GPU +#if 0 + char *env = getenv("ACL_TOPKV2"); + if (env) + topk_env = env; + + if (topk_env == "GPU_SINGLE") + { + _qs_idx_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n); + _qs_temp_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n); + + _qs_kernel.configure(input, values, indices, &_qs_idx_buf, &_qs_temp_buf, k, _n); + } + else if (topk_env == "GPU") + { + // n should be divided by (_GROUPS * _ITEMS) + ARM_COMPUTE_ERROR_ON((_n % (_GROUPS * _ITEMS)) != 0); + + _hist_buf_size = _radix * _GROUPS * _ITEMS; + _glob_sum_buf_size = _HISTOSPLIT; + + _hist_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(cl_int) * _hist_buf_size); + _glob_sum_buf = + cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(cl_int) * _glob_sum_buf_size); + _temp_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(cl_int) * _glob_sum_buf_size); + _first_negative_idx_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int)); + _in_key_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_float) * _n); + _out_key_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_float) * _n); + _in_ind_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n); + _out_ind_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | 
CL_MEM_READ_WRITE, sizeof(cl_int) * _n); + + _p_in_key_buf = &_in_key_buf; + _p_out_key_buf = &_out_key_buf; + _p_in_ind_buf = &_in_ind_buf; + _p_out_ind_buf = &_out_ind_buf; + + _init_kernel.configure(input, _p_in_key_buf, _p_in_ind_buf, _n); + _hist_kernel.configure(&_hist_buf, bits, _n); + _scan_hist_kernel.configure(&_hist_buf, &_glob_sum_buf, bits); + _glob_scan_hist_kernel.configure(&_glob_sum_buf, &_temp_buf, bits); + _paste_hist_kernel.configure(&_hist_buf, &_glob_sum_buf, bits); + _reorder_kernel.configure(&_hist_buf, bits, _n); + _find_first_negative_kernel.configure(&_first_negative_idx_buf, _n); + _reorder_negatives_kernel.configure(&_first_negative_idx_buf, _n); + _store_kernel.configure(values, indices, k, _n); + } + else +#endif // Disable GPU implementation + { + // DO NOTHING for CPU. + } +} + +void CLTopKV2::run() +{ + std::string topk_env; +#if 0 + char *env = getenv("ACL_TOPKV2"); + if (env) + topk_env = env; + + if (topk_env == "GPU_SINGLE") + { + run_on_gpu_single_quicksort(); + } + else if (topk_env == "GPU") + { + run_on_gpu(); + } + else +#endif + { + run_on_cpu(); + } +} + +#if 0 +void CLTopKV2::run_on_gpu_single_quicksort() +{ + // This is a single threaded quick sort implementation. + CLScheduler::get().enqueue(_qs_kernel, false); + + arm_compute::CLScheduler::get().sync(); +} + +void CLTopKV2::run_on_gpu() +{ + cl::CommandQueue q = CLScheduler::get().queue(); + + // 1. CLTopKV2Init set key buffer and index buffer. + // - Key buffer is set as the same value of the layer's input + // - Values in the index buffer are set as their indices. + CLScheduler::get().enqueue(_init_kernel, false); + + int n_passes = _total_bits / _bits; + + // 2. Repeat (total_bits/bits) times. + // - total_bits is the number of bits of the data type (e.g., 32 for float) + // - bits defines number of buckets (e.g. 16 buckets where bit is 4) + for (int pass = 0; pass < n_passes; ++pass) + { + arm_compute::CLScheduler::get().sync(); + + // 2.1. 
Calculate histogram with _GROUPS * _ITEMS threads + _hist_kernel.setPass(pass, _p_in_key_buf); + CLScheduler::get().enqueue(_hist_kernel, false); + + // 2.2. Calculate prefix sum locally with multiple threads + CLScheduler::get().enqueue(_scan_hist_kernel, false); + // 2.3. Calculate prefix sum within a work group + CLScheduler::get().enqueue(_glob_scan_hist_kernel, false); + // 2.4. Calculate global prefix sum + CLScheduler::get().enqueue(_paste_hist_kernel, false); + + // 2.5. Reorder keys and indices based on the global prefix sum + _reorder_kernel.setPass(pass, _p_in_key_buf, _p_out_key_buf, _p_in_ind_buf, _p_out_ind_buf); + CLScheduler::get().enqueue(_reorder_kernel, false); + + cl::Buffer *tmp; + // swap key buffers + tmp = _p_in_key_buf; + _p_in_key_buf = _p_out_key_buf; + _p_out_key_buf = tmp; + + // swap index buffers + tmp = _p_in_ind_buf; + _p_in_ind_buf = _p_out_ind_buf; + _p_out_ind_buf = tmp; + } + + // 3. Get the first negative index + // Because we swap in_buf and out_buf at the end of the above for loop, + // the output buffers are in bufs. + _find_first_negative_kernel.setOutputBuffer(_p_in_key_buf); + CLScheduler::get().enqueue(_find_first_negative_kernel, false); + + // 4. Correct odering of negatives + // - Since radix sort does not consider negatives, negatives are considered as bigger values + // than positives. + // reordered data will be stored in _p_out_key_buf and _p_out_ind_buf + _reorder_negatives_kernel.setBuffers(_p_in_key_buf, _p_out_key_buf, _p_in_ind_buf, + _p_out_ind_buf); + CLScheduler::get().enqueue(_reorder_negatives_kernel, false); + + // 5. Extract top k values from sorted keys and indices. + _store_kernel.setOutputBuffers(_p_out_key_buf, _p_out_ind_buf); + CLScheduler::get().enqueue(_store_kernel, false); + + arm_compute::CLScheduler::get().sync(); + +#if 0 + // below code is left for debugging. 
+ int first_neg; + q.enqueueReadBuffer(_first_negative_idx_buf, CL_TRUE, 0, sizeof(cl_int), &first_neg); + std::cout << "first neg = " << first_neg << std::endl; + + float in_key[_n]; + q.enqueueReadBuffer(*_p_in_key_buf, CL_TRUE, 0, sizeof(cl_float)*_n, in_key); + for(uint32_t i = 0 ; i < _n; ++i) { + std::cout << "in_key[" << i << "] = " << in_key[i] << std::endl; + } + + float out_key[_n]; + q.enqueueReadBuffer(*_p_out_key_buf, CL_TRUE, 0, sizeof(cl_float)*_n, out_key); + for(uint32_t i = 0 ; i < _n; ++i) { + std::cout << "out_key[" << i << "] = " << out_key[i] << std::endl; + } + + int in_ind[_n]; + q.enqueueReadBuffer(*_p_in_ind_buf, CL_TRUE, 0, sizeof(cl_int)*_n, in_ind); + for(uint32_t i = 0 ; i < _n; ++i) { + std::cout << "in_ind[" << i << "] = " << in_ind[i] << std::endl; + } + + int out_ind[_n]; + q.enqueueReadBuffer(*_p_out_ind_buf, CL_TRUE, 0, sizeof(cl_int)*_n, out_ind); + for(uint32_t i = 0 ; i < _n; ++i) { + std::cout << "out_ind[" << i << "] = " << out_ind[i] << std::endl; + } + + int hist_buf[_hist_buf_size]; + q.enqueueReadBuffer(_hist_buf, CL_TRUE, 0, sizeof(cl_int)*_hist_buf_size, hist_buf); + for(uint32_t i = 0 ; i < _hist_buf_size; ++i) { + std::cout << "hist_buf[" << i << "] = " << hist_buf[i] << std::endl; + } + + int glob_sum_buf[_glob_sum_buf_size]; + q.enqueueReadBuffer(_glob_sum_buf, CL_TRUE, 0, sizeof(cl_int)*_glob_sum_buf_size, glob_sum_buf); + for(uint32_t i = 0 ; i < _glob_sum_buf_size; ++i) { + std::cout << "glob_sum_buf[" << i << "] = " << glob_sum_buf[i] << std::endl; + } + +#endif +} +#endif // Disable GPU implementation + +void CLTopKV2::run_on_cpu() +{ + cl::CommandQueue q = CLScheduler::get().queue(); + // const Window& w = _topkv2_kernel.window(); + + _input->map(q); + _values->map(q); + _indices->map(q); + + // int row_size = (w[0].end() - w[0].start()) / w[0].step(); + int row_size = _input->info()->tensor_shape()[0]; + int rank = _input->info()->num_dimensions(); + + if (rank > 2) + throw std::runtime_error("Not supported 
type."); + + int row_num = (rank == 2 ? _input->info()->tensor_shape()[1] : 1); + + if (_input->info()->data_type() == DataType::F32) + { + nnfw::rt::optimized_ops::TopK<float>(row_size, row_num, (float *)_input->buffer(), _k, + (int32 *)_indices->buffer(), (float *)_values->buffer()); + } + else if (_input->info()->data_type() == DataType::S32) + { + nnfw::rt::optimized_ops::TopK<int32_t>(row_size, row_num, (int32_t *)_input->buffer(), _k, + (int32 *)_indices->buffer(), + (int32_t *)_values->buffer()); + } + else if (_input->info()->data_type() == DataType::QASYMM8) + { + nnfw::rt::optimized_ops::TopK<uint8_t>(row_size, row_num, (uint8_t *)_input->buffer(), _k, + (int32 *)_indices->buffer(), + (uint8_t *)_values->buffer()); + } + else + { + throw std::runtime_error("Not supported type."); + } + + _input->unmap(q); + _values->unmap(q); + _indices->unmap(q); +} + +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp new file mode 100644 index 000000000..40e21671d --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLTransposeConvLayer.h" +#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/UtilsEx.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/CPP/CPPScheduler.h" + +#include <memory> +#include <tuple> + +using namespace arm_compute; +using namespace arm_compute::misc::shape_calculator; + +CLTransposeConvLayer::CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT + : _memory_group(std::move(memory_manager)), + _scale_f(), + _conv_f(), + _flip_weights(), + _scaled_output(), + _original_weights(nullptr), + _weights_flipped(), + _is_prepared(false) +{ +} + +Status CLTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *bias, ITensorInfo *output, + const PadStrideInfo &info, unsigned int invalid_right, + unsigned int invalid_bottom, const WeightsInfo &weights_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); + + const DataLayout data_layout = input->data_layout(); + + const size_t idx_w = 
get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1); + + const unsigned int kernel_x = weights->dimension(idx_w); + const unsigned int kernel_y = weights->dimension(idx_h); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(invalid_right > kernel_x - 1, + "invalid_right must be smaller than kernel_x"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(invalid_bottom > kernel_y - 1, + "inner_border_top must be smaller than kernel_y"); + + // NOTE From the existing CLDeconvolutionLayer, invalid_right and invalid_bottom were added. + auto out_dims = transposeconv_output_dimensions( + input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), + weights->dimension(idx_h), info, invalid_right, invalid_bottom); + + const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights); + + if (bias != nullptr) + { + if (is_data_type_quantized_asymmetric(input->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); + } + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias); + } + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w], + "Output's width is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h], + "Output's height is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c], + "Output's depth is invalid."); + + unsigned int pad_left = 0; + unsigned int pad_right = 0; + 
unsigned int pad_top = 0; + unsigned int pad_bottom = 0; + const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( + *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top, + pad_bottom); + TensorInfo scale_out_info(input->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(scale_out_shape) + .set_data_layout(data_layout)); + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + + ARM_COMPUTE_RETURN_ON_ERROR( + CLTransposeConvLayerUpsample::validate(input, &scale_out_info, BorderSize(0, 0), info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, + conv_info, weights_info)); + + return Status{}; +} + +void CLTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, + ICLTensor *output, const PadStrideInfo &info, + unsigned int invalid_right, unsigned int invalid_bottom, + const WeightsInfo &weights_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + + const unsigned int stride_x = info.stride().first; + const unsigned int stride_y = info.stride().second; + + const DataLayout data_layout = input->info()->data_layout(); + + const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + _original_weights = weights; + _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); + _flip_weights.configure(weights, &_weights_flipped); + + // NOTE From the existing CLDeconvolutionLayer, invalid_right and invalid_bottom were + // added. 
+ auto out_dims = transposeconv_output_dimensions( + input->info()->dimension(idx_w), input->info()->dimension(idx_h), + weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right, + invalid_bottom); + + const TensorShape output_shape = + compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); + + // Output auto initialization if not yet initialized + auto_init_if_empty( + *output->info(), + input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout)); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(CLTransposeConvLayer::validate( + input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), + info, invalid_right, invalid_bottom)); + + _is_prepared = weights_info.retain_internal_weights(); + + _memory_group.manage(&_scaled_output); + + // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order + // to match output shape + unsigned int pad_left = 0; + unsigned int pad_right = 0; + unsigned int pad_top = 0; + unsigned int pad_bottom = 0; + const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( + *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, + pad_right, pad_top, pad_bottom); + + TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); + scale_out_info.set_data_layout(data_layout); + _scaled_output.allocator()->init(scale_out_info); + + // configure scale function + const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, + DimensionRoundingType::FLOOR); + _scale_f.configure(input, &_scaled_output, BorderSize(0, 0), upsample_info); + + // setup the function to convolve the upscaled output + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, 
weights_info); + _scaled_output.allocator()->allocate(); +} + +void CLTransposeConvLayer::run() +{ + prepare(); + + _memory_group.acquire(); + + _scale_f.run(); + _conv_f.run(); + + _memory_group.release(); +} + +void CLTransposeConvLayer::prepare() +{ + if (!_is_prepared) + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + // Run weights flipping and mark original weights tensor as unused + _weights_flipped.allocator()->allocate(); + _weights_flipped.map(true); + _original_weights->map(CLScheduler::get().queue(), true); + CPPScheduler::get().schedule(&_flip_weights, Window::DimZ); + _weights_flipped.unmap(); + _original_weights->unmap(CLScheduler::get().queue()); + _original_weights->mark_as_unused(); + + // Prepare convolution + _conv_f.prepare(); + + if (!_weights_flipped.is_used()) + { + _weights_flipped.allocator()->free(); + } + + _is_prepared = true; + } +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp new file mode 100644 index 000000000..0ce3e6700 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h" + +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include <cmath> +#include <memory> +#include <tuple> + +using namespace arm_compute; + +CLTransposeConvLayerUpsample::CLTransposeConvLayerUpsample() // NOLINT + : _upsample(), + _output(nullptr) +{ +} + +Status CLTransposeConvLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, + const BorderSize &inner_border, + const PadStrideInfo &info) +{ + return CLTransposeConvLayerUpsampleKernel::validate(input, output, inner_border, info); +} + +void CLTransposeConvLayerUpsample::configure(ICLTensor *input, ICLTensor *output, + const BorderSize &inner_border, + const PadStrideInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + _output = output; + _upsample.configure(input, _output, inner_border, info); +} + +void CLTransposeConvLayerUpsample::run() +{ + _output->map(CLScheduler::get().queue(), true); + if (is_data_type_quantized_asymmetric(_output->info()->data_type())) + { + const uint8_t quantized_zero = _output->info()->quantization_info().offset; + std::fill_n(_output->buffer(), _output->info()->total_size(), quantized_zero); + } + else + { + memset(_output->buffer(), 0, _output->info()->total_size()); + } + _output->unmap(CLScheduler::get().queue()); + + CLScheduler::get().enqueue(_upsample, false); +} diff --git a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp new file mode 100644 index 000000000..f8e0ef8a6 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CPP/functions/CPPUpsampleEx.h" + +#include "arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h" +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +void CPPUpsampleEx::configure(const ITensor *input, ITensor *output, const PadStrideInfo &info) +{ + auto k = arm_compute::support::cpp14::make_unique<CPPUpsampleKernelEx>(); + k->configure(input, output, info); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/NEFunctionsEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/NEFunctionsEx.cpp new file mode 100644 index 000000000..80fbf359d --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/NEFunctionsEx.cpp @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_compute/runtime/NEON/NEFunctionsEx.h" + +// NOTE This empty file aims to validate "NEFunctionsEx.h". +// DO NOT REMOVE this file. diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEArgMinMax.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEArgMinMax.cpp new file mode 100644 index 000000000..5ba465b61 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEArgMinMax.cpp @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEArgMinMax.h" + +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +namespace arm_compute +{ + +template <ReductionOperation OP> +NEArgMinMaxStatic<OP>::NEArgMinMaxStatic(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _reduction_kernel(), _reduced_out(), _reshape() +{ +} + +template <ReductionOperation OP> +Status NEArgMinMaxStatic<OP>::validate(const ITensorInfo *input, int axis, + const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, + DataType::F32); + + TensorShape out_shape = input->tensor_shape(); + const int input_dims = input->num_dimensions(); + int axis_local = axis; + + // Convert negative axis + axis_local = wrap_around(axis_local, input_dims); + + ARM_COMPUTE_RETURN_ERROR_ON(axis_local > 3); + ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local) > input->num_dimensions() - 1); + out_shape.remove_dimension(axis_local); + + const TensorInfo out_info = output->clone()->set_tensor_shape(out_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info); + + return Status{}; +} + +template <ReductionOperation OP> +void NEArgMinMaxStatic<OP>::configure(ITensor *input, int axis, ITensor 
*output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + int axis_local = axis; + const int input_dims = input->info()->num_dimensions(); + + // Convert negative axis + axis_local = wrap_around(axis_local, input_dims); + + // Perform reduction for axis + TensorShape intermediate_shape = input->info()->tensor_shape(); + intermediate_shape.set(axis_local, 1); + auto in = input; + + _reduced_out.allocator()->init(TensorInfo(intermediate_shape, output->info()->num_channels(), + output->info()->data_type(), + output->info()->quantization_info())); + _memory_group.manage(&_reduced_out); + _reduction_kernel.configure(in, axis_local, &_reduced_out, OP); + + // Allocate intermediate tensor + _reduced_out.allocator()->allocate(); + + // Configure reshape layer if we want to drop the dimensions + TensorShape out_shape = input->info()->tensor_shape(); + out_shape.remove_dimension(axis_local); + auto_init_if_empty(*output->info(), output->info()->clone()->set_tensor_shape(out_shape)); + _reshape.configure(&_reduced_out, output); +} + +template <ReductionOperation OP> void NEArgMinMaxStatic<OP>::run() +{ + MemoryGroupResourceScope scope_mg(_memory_group); + + _reduction_kernel.run(); + _reshape.run(); +} + +// Supported Specializations +template class NEArgMinMaxStatic<ReductionOperation::ARG_IDX_MAX>; +template class NEArgMinMaxStatic<ReductionOperation::ARG_IDX_MIN>; +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp new file mode 100644 index 000000000..7c15fc453 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h" +#include <arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h> + +#include "arm_compute/core/ITensor.h" +#include "support/ToolchainSupport.h" + +#include <utility> + +namespace arm_compute +{ + +template <BinaryLogicalOperation COP> +void NEBinaryLogicalOperationStatic<COP>::configure(ITensor *input1, ITensor *input2, + ITensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<NEBinaryLogicalOperationKernel>(); + k->configure(COP, input1, input2, output); + _kernel = std::move(k); +} + +template <BinaryLogicalOperation COP> +Status NEBinaryLogicalOperationStatic<COP>::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output) +{ + return NEBinaryLogicalOperationKernel::validate(COP, input1, input2, output); +} + +void NEBinaryLogicalOperation::configure(ITensor *input1, ITensor *input2, ITensor *output, + BinaryLogicalOperation op) +{ + auto k = arm_compute::support::cpp14::make_unique<NEBinaryLogicalOperationKernel>(); + k->configure(op, input1, input2, output); + _kernel = std::move(k); +} + +Status NEBinaryLogicalOperation::validate(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output, BinaryLogicalOperation op) +{ + return NEBinaryLogicalOperationKernel::validate(op, input1, input2, output); +} + +// Supported Specializations +template class NEBinaryLogicalOperationStatic<BinaryLogicalOperation::AND>; +template class NEBinaryLogicalOperationStatic<BinaryLogicalOperation::OR>; +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp new file mode 100644 index 000000000..f2490e4e8 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/runtime/NEON/functions/NECast.h" + +#include "arm_compute/core/NEON/kernels/NECastKernel.h" +#include "support/ToolchainSupport.h" + +namespace arm_compute +{ +void NECast::configure(const ITensor *input, ITensor *output, SubDataType input_subtype) +{ + auto k = arm_compute::support::cpp14::make_unique<NECastKernel>(); + k->configure(input, output, input_subtype); + _kernel = std::move(k); +} + +Status NECast::validate(const ITensorInfo *input, const ITensorInfo *output, + SubDataType input_subtype) +{ + return NECastKernel::validate(input, output, input_subtype); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp new file mode 100644 index 000000000..db419e3a8 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +namespace arm_compute +{ +void NEDepthToSpaceLayerEx::configure(const ITensor *input, ITensor *output, int32_t block_shape) +{ + auto k = arm_compute::support::cpp14::make_unique<NEDepthToSpaceLayerKernelEx>(); + k->configure(input, output, block_shape); + _kernel = std::move(k); +} + +Status NEDepthToSpaceLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, + int32_t block_shape) +{ + return NEDepthToSpaceLayerKernelEx::validate(input, output, block_shape); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEElementwiseUnaryLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEElementwiseUnaryLayerEx.cpp new file mode 100644 index 000000000..a95018a28 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEElementwiseUnaryLayerEx.cpp @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h" + +#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h" +#include "support/ToolchainSupport.h" + +#include <utility> + +namespace arm_compute +{ +void NENegLayer::configure(const ITensor *input, ITensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<NEElementwiseUnaryKernelEx>(); + k->configure(ElementWiseUnaryEx::NEG, input, output); + _kernel = std::move(k); +} +Status NENegLayer::validate(const ITensorInfo *input, const ITensorInfo *output) +{ + return NEElementwiseUnaryKernelEx::validate(ElementWiseUnaryEx::NEG, input, output); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp new file mode 100644 index 000000000..00c3ed94f --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h" + +#include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h" +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +void NEEmbeddingLookup::configure(const ITensor *input, ITensor *output, const ITensor *lookups) +{ + auto k = arm_compute::support::cpp14::make_unique<NEEmbeddingLookupKernel>(); + k->configure(input, output, lookups); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp new file mode 100644 index 000000000..d604fedbf --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp @@ -0,0 +1,282 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
 IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h"

#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"

#include <algorithm>
#include <cmath>

using namespace arm_compute;
using namespace arm_compute::misc::shape_calculator;

namespace
{
// File-local helper: validate the low-precision (gemmlowp) matrix multiply
// for an already-quantized input against the S8 weights.
Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output)
{
  ARM_COMPUTE_RETURN_ON_ERROR(
      NEGEMMLowpMatrixMultiplyCoreEx::validate(&input, &weights, nullptr, &output));

  return Status{};
}
} // namespace

// Thin wrapper: weight reshaping for this layer is a plain transpose,
// delegated to NETransposeKernel.
void NEFullyConnectedHybridLayerReshapeWeights::configure(const ITensor *input, ITensor *output)
{
  auto k = arm_compute::support::cpp14::make_unique<NETransposeKernel>();
  k->configure(input, output);
  _kernel = std::move(k);
}

Status NEFullyConnectedHybridLayerReshapeWeights::validate(const ITensorInfo *input,
                                                           const ITensorInfo *output)
{
  return NETransposeKernel::validate(input, output);
}

// NOTE(review): _multiply_scale_kernel, _quant_input_kernel's output tensors and
// _gemmlowp_output are members not listed here; they are default-constructed.
NEFullyConnectedHybridLayer::NEFullyConnectedHybridLayer(
    std::shared_ptr<IMemoryManager> memory_manager)
    : _memory_group(std::move(memory_manager)), _reshape_weights_function(), _quant_input_kernel(),
      _mm_gemmlowp(), _accumulate_biases_kernel(), _reshape_weights_output(), _quantized_input(),
      _scale_factor(), _original_weights(nullptr), _are_weights_reshaped(false),
      _accumulate_biases(false), _is_prepared(false)
{
}

// Configure the low-precision GEMM; expects input width == weights height
// (i.e. weights already transposed to [out, in] layout).
void NEFullyConnectedHybridLayer::configure_mm(const ITensor *input, const ITensor *weights,
                                               ITensor *output)
{
  ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));

  // Configure gemmlowp function
  _mm_gemmlowp.configure(input, weights, nullptr, output);
}

// Build the hybrid FC pipeline: (optional weight transpose) -> symmetric
// quantization of the F16/F32 input to S8 -> gemmlowp matmul -> rescale by the
// per-run scale factor -> (optional bias accumulation, scheduled in run()).
void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor *weights,
                                            const ITensor *biases, ITensor *output,
                                            FullyConnectedLayerInfo fc_info)
{
  ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);

  // Perform validate step
  ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedHybridLayer::validate(
      input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
      fc_info));

  _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
  _accumulate_biases = false;
  _original_weights = weights;

  // Configure accumulate biases kernel for non quantized asymmetric types
  if (biases != nullptr)
  {
    _accumulate_biases = true;

    // Configure accumulate biases kernel
    _accumulate_biases_kernel.configure(output, biases);
  }

  // With the Fully Connected layer we can have 4 different cases:
  //  1) Convolution layer -> Fully Connected layer without batches
  //  2) Fully Connected layer -> Fully Connected layer without batches
  //  3) Convolution layer -> Fully Connected layer with batches
  //  4) Fully Connected layer -> Fully Connected layer with batches

  const ITensor *weights_to_use = weights;

  // Check if we have a fully connected layer with batches
  const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
  // NOTE(review): underscore-prefixed name for a LOCAL shadows the member
  // naming convention; it is only used to assert the unsupported case below.
  bool _is_fc_after_conv;
  if (is_batched_fc_layer)
  {
    _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
                        (std::equal(input->info()->tensor_shape().cbegin() + 3,
                                    input->info()->tensor_shape().cend(),
                                    output->info()->tensor_shape().cbegin() + 1));
  }
  else
  {
    _is_fc_after_conv = input->info()->num_dimensions() > 1 && input->info()->dimension(1) > 1;
  }
  // This layer only supports FC-after-FC; a conv-shaped input is rejected here.
  ARM_COMPUTE_ERROR_ON_MSG(_is_fc_after_conv,
                           "NEFullyConnectedHybridLayer does not support after conv");
  (void)_is_fc_after_conv;

  // Reshape weights if needed
  if (!_are_weights_reshaped)
  {
    // Reshape the weights
    _reshape_weights_output.allocator()->init(
        weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
            compute_transposed_shape(*weights->info())));
    _reshape_weights_function.configure(weights_to_use, &_reshape_weights_output);
    weights_to_use = &_reshape_weights_output;
  }

  // Quantize input: S8 values plus one F32 scale factor per output row.
  _quantized_input.allocator()->init(
      input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8));
  _scale_factor.allocator()->init(
      TensorInfo(TensorShape{output->info()->dimension(1)}, 1, DataType::F32));
  _quant_input_kernel.configure(input, &_quantized_input, &_scale_factor);

  // GEMM: accumulate into S32, rescaled to the output type below.
  _gemmlowp_output.allocator()->init(
      output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
  configure_mm(&_quantized_input, weights_to_use, &_gemmlowp_output);

  // Multiply scale
  _multiply_scale_kernel.configure(&_gemmlowp_output, &_scale_factor, output,
                                   weights->info()->quantization_info().scale);

  _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights;

  // Intermediates can be allocated once all configure() calls have been made.
  _quantized_input.allocator()->allocate();
  _scale_factor.allocator()->allocate();
  _gemmlowp_output.allocator()->allocate();
}

// Static validation mirroring configure(): checks types (F16/F32 input,
// S8 weights), dimensionality, and validates every sub-kernel in order.
Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITensorInfo *weights,
                                             const ITensorInfo *biases, const ITensorInfo *output,
                                             FullyConnectedLayerInfo fc_info)
{
  ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights);
  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::S8);
  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
  ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
  ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2);

  bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;

  // NOTE(review): const ref bound to a temporary TensorInfo — lifetime is
  // extended to the end of this function, so taking its address below is safe.
  const ITensorInfo &reshaped_weights =
      TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
          compute_transposed_shape(*weights)));

  // Configure accumulate biases kernel for non quantized asymmetric types
  if (biases != nullptr)
  {
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAccumulateBiasesKernel::validate(output, biases));
  }

  // With the Fully Connected layer we can have 4 different cases:
  //  1) Convolution layer -> Fully Connected layer without batches
  //  2) Fully Connected layer -> Fully Connected layer without batches
  //  3) Convolution layer -> Fully Connected layer with batches
  //  4) Fully Connected layer -> Fully Connected layer with batches

  const ITensorInfo *weights_to_use = weights;

  if (!weights_reshaped)
  {
    // Validate reshape weights kernel
    ARM_COMPUTE_RETURN_ON_ERROR(
        NEFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights));
    weights_to_use = &reshaped_weights;
  }

  // Fully Connected layer after a Fully Connected Layer without batches
  ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));

  // Validate quantization kernel
  const ITensorInfo &quantized_input = TensorInfo(
      input->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8));
  const ITensorInfo &scale_factor = TensorInfo(TensorShape{output->dimension(1)}, 1, DataType::F32);
  ARM_COMPUTE_RETURN_ON_ERROR(
      NEQuantizationSymmetricKernel::validate(input, &quantized_input, &scale_factor));

  const ITensorInfo &gemmlowp_output = TensorInfo(
      output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
  // Validate matrix multiply kernel
  ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(quantized_input, *weights_to_use, gemmlowp_output));

  ARM_COMPUTE_RETURN_ON_ERROR(NEMultiplyScaleFactorKernel::validate(
      &gemmlowp_output, &scale_factor, output, weights->quantization_info().scale));

  return Status{};
}

// Execute the configured pipeline; one-time work is delegated to prepare().
void NEFullyConnectedHybridLayer::run()
{
  prepare();

  MemoryGroupResourceScope scope_mg(_memory_group);

  // Quantize input
  NEScheduler::get().schedule(&_quant_input_kernel, Window::DimY);

  // Run matrix multiply
  _mm_gemmlowp.run();

  // Multiply scale factor
  NEScheduler::get().schedule(&_multiply_scale_kernel, Window::DimY);

  // Accumulate biases if provided
  if (_accumulate_biases)
  {
    NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY);
  }
}

// One-time preparation: reshape weights (if requested), prepare the GEMM, and
// free the reshaped-weights scratch tensor once nothing uses it anymore.
void NEFullyConnectedHybridLayer::prepare()
{
  if (!_is_prepared)
  {
    ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());

    auto release_unused = [](Tensor *w) {
      if (!w->is_used())
      {
        w->allocator()->free();
      }
    };

    // Reshape of the weights (happens only once)
    if (!_are_weights_reshaped)
    {
      // Run reshape weights kernel and mark weights as unused
      _reshape_weights_output.allocator()->allocate();
      _reshape_weights_function.run();

      _are_weights_reshaped = true;
      // We can not release _original_weights because it can be used in other nodes
    }

    // Prepare GEMM prepare and release unused weights
    _mm_gemmlowp.prepare();

    // Release reshaped weights if unused
    release_unused(&_reshape_weights_output);

    _is_prepared = true;
  }
}

/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright (c) 2017-2019 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Size2D.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include <algorithm> +#include <cmath> + +using namespace arm_compute; +using namespace arm_compute::misc::shape_calculator; + +namespace +{ +Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output) +{ + if (is_data_type_quantized_asymmetric(input.data_type())) + { + // Since we need negative offsets for computing convolution, we need to change + // QuantizationInfo() + // Extract and negate input and weights offset + const QuantizationInfo input_quantization_info(input.quantization_info().scale, + -input.quantization_info().offset); + const QuantizationInfo weights_quantization_info(weights.quantization_info().scale, + -weights.quantization_info().offset); + + // Validate gemmlowp function + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate( + &input.clone()->set_quantization_info(input_quantization_info), + &weights.clone()->set_quantization_info(weights_quantization_info), nullptr, &output)); + } + else + { + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate( + &input, &weights, nullptr, &output, 1.f, 0.0f, + GEMMInfo(false, false, false /* Reshape weights only for the first run */))); + } + + return Status{}; +} +} // namespace + +NEFullyConnectedLayerEx::NEFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _flatten_kernel(), _convert_weights(), + _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _gemmlowp_output_stage(), + _accumulate_biases_kernel(), _flatten_output(), _gemmlowp_output(), + _converted_weights_output(), _reshape_weights_output(), 
_original_weights(nullptr), + _are_weights_converted(true), _are_weights_reshaped(false), _is_fc_after_conv(false), + _accumulate_biases(false), _is_quantized(false), _is_prepared(false) +{ +} + +void NEFullyConnectedLayerEx::configure_mm(const ITensor *input, const ITensor *weights, + ITensor *output) +{ + if (_is_quantized) + { + // Since we need negative offsets for computing convolution, we need to change + // QuantizationInfo() + // Extract and negate input and weights offset + const QuantizationInfo input_quantization_info = input->info()->quantization_info(); + const QuantizationInfo weights_quantization_info = weights->info()->quantization_info(); + + input->info()->set_quantization_info( + QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset)); + weights->info()->set_quantization_info( + QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset)); + + // Configure gemmlowp function + _mm_gemmlowp.configure(input, weights, nullptr, output); + + // Revert back QuantizatioInfo as input and weights could be used in other fully connected + // layers + input->info()->set_quantization_info(input_quantization_info); + weights->info()->set_quantization_info(weights_quantization_info); + } + else + { + // Configure matrix multiply kernel + _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f, + GEMMInfo(false, false, false /* Reshape weights only for the first run */)); + } +} + +void NEFullyConnectedLayerEx::configure_conv_fc(const ITensor *input, const ITensor *weights, + ITensor *output) +{ + ARM_COMPUTE_ERROR_ON( + (weights->info()->dimension(1) != + (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); + + // If the fully connected layer is called after a convolution layer, the input tensor must be + // linearized + + // Initialize output tensor for flatten + TensorShape shape_flatten = compute_flatten_shape(input->info()); + _flatten_output.allocator()->init( + 
input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + shape_flatten)); + + // Configure flatten kernel + _memory_group.manage(&_flatten_output); + _flatten_kernel.configure(input, &_flatten_output); + + // Configure matrix multiply kernel + configure_mm(&_flatten_output, weights, output); + + // Allocate the output tensor for flatten once all the configure methods have been called + _flatten_output.allocator()->allocate(); +} + +void NEFullyConnectedLayerEx::configure_fc_fc(const ITensor *input, const ITensor *weights, + ITensor *output) +{ + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1)); + + // Configure matrix multiply kernel + configure_mm(input, weights, output); +} + +void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *weights, + const ITensor *biases, ITensor *output, + FullyConnectedLayerInfo fc_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayerEx::validate( + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + fc_info)); + + _are_weights_converted = true; + _are_weights_reshaped = fc_info.transpose_weights ? 
fc_info.are_weights_reshaped : true; + _is_fc_after_conv = true; + _accumulate_biases = false; + _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); + _original_weights = weights; + + // Configure gemmlowp output + if (_is_quantized) + { + _gemmlowp_output.allocator()->init( + output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( + DataType::S32)); + } + + // Configure accumulate biases kernel for non quantized asymmetric types + if (biases != nullptr && !_is_quantized) + { + _accumulate_biases = true; + + // Configure accumulate biases kernel + _accumulate_biases_kernel.configure(output, biases); + } + + // With the Fully Connected layer we can have 4 different cases: + // 1) Convolution layer -> Fully Connected layer without batches + // 2) Fully Connected layer -> Fully Connected layer without batches + // 3) Convolution layer -> Fully Connected layer with batches + // 4) Fully Connected layer -> Fully Connected layer with batches + + const ITensor *weights_to_use = weights; + + // Check if we have a fully connected layer with batches + const bool is_batched_fc_layer = output->info()->dimension(1) > 1; + if (is_batched_fc_layer) + { + _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(input->info()->tensor_shape().cbegin() + 3, + input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)); + } + else + { + _is_fc_after_conv = input->info()->num_dimensions() > 1; + } + + // Reshape weights if needed + if (!_are_weights_reshaped) + { + // Reshape the weights + _reshape_weights_function.configure(weights, &_reshape_weights_output); + weights_to_use = &_reshape_weights_output; + } + + // Convert weights if needed + if (_is_fc_after_conv && (input->info()->data_layout() != fc_info.weights_trained_layout)) + { + // Convert weights + _convert_weights.configure(weights_to_use, &_converted_weights_output, + input->info()->tensor_shape(), fc_info.weights_trained_layout); 
+
+    weights_to_use = &_converted_weights_output;
+    _are_weights_converted = false;
+  }
+
+  // When quantized, the matrix multiply emits S32 into _gemmlowp_output and the
+  // output stage below requantizes into the user's output tensor.
+  ITensor *tmp_output = (_is_quantized) ? &_gemmlowp_output : output;
+  if (_is_fc_after_conv)
+  {
+    // Fully Connected layer after a Convolution Layer without batches
+    configure_conv_fc(input, weights_to_use, tmp_output);
+  }
+  else
+  {
+    // Fully Connected layer after a Fully Connected Layer without batches
+    configure_fc_fc(input, weights_to_use, tmp_output);
+  }
+
+  // Configure output stage for asymmetric quantized types
+  if (_is_quantized)
+  {
+    // Effective rescale factor: (scale_in * scale_w) / scale_out, expressed as a
+    // fixed-point multiplier + shift for the integer-only output stage.
+    float multiplier = input->info()->quantization_info().scale *
+                       weights->info()->quantization_info().scale /
+                       output->info()->quantization_info().scale;
+    int output_multiplier;
+    int output_shift;
+    quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier,
+                                                               &output_shift);
+    _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier,
+                                     output_shift, output->info()->quantization_info().offset);
+    _gemmlowp_output.allocator()->allocate();
+  }
+
+  _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights;
+}
+
+// Static shape/type validation mirroring configure(): builds temporary TensorInfo
+// descriptors (no allocations) and checks each sub-kernel that configure() would use.
+// Returns an error Status on the first failed check.
+Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights,
+                                         const ITensorInfo *biases, const ITensorInfo *output,
+                                         FullyConnectedLayerInfo fc_info)
+{
+  ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights);
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+                                                       DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+  ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
+
+  // If the caller does not want a transpose, the weights are treated as already reshaped.
+  bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+  bool is_fc_after_conv = true;
+  bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+
+  const ITensorInfo &flatten_input =
+      TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+          compute_flatten_shape(input)));
+  const ITensorInfo &reshaped_weights =
+      TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+          compute_transposed_shape(*weights)));
+  const ITensorInfo &converted_weights =
+      weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding())
+                       : TensorInfo(*reshaped_weights.clone());
+  const ITensorInfo &gemmlowp_output = TensorInfo(
+      output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+
+  // Configure accumulate biases kernel for non quantized asymmetric types
+  if (biases != nullptr && !is_quantized)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAccumulateBiasesKernel::validate(output, biases));
+  }
+
+  // With the Fully Connected layer we can have 4 different cases:
+  //  1) Convolution layer -> Fully Connected layer without batches
+  //  2) Fully Connected layer -> Fully Connected layer without batches
+  //  3) Convolution layer -> Fully Connected layer with batches
+  //  4) Fully Connected layer -> Fully Connected layer with batches
+
+  const ITensorInfo *input_to_use = input;
+  const ITensorInfo *weights_to_use = weights;
+  const ITensorInfo *tmp_output = (is_quantized) ? &gemmlowp_output : output;
+
+  // Check if we have a fully connected layer with batches
+  const bool is_batched_fc_layer = output->dimension(1) > 1;
+
+  if (is_batched_fc_layer)
+  {
+    // Batched case: previous layer was a convolution iff the trailing (batch) dims of
+    // the input match the output dims shifted by one.
+    is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+                       (std::equal(input->tensor_shape().cbegin() + 3, input->tensor_shape().cend(),
+                                   output->tensor_shape().cbegin() + 1));
+  }
+  else
+  {
+    is_fc_after_conv = input->num_dimensions() > 1;
+  }
+
+  if (!weights_reshaped)
+  {
+    // Validate reshape weights kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        NEFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights));
+    weights_to_use = &reshaped_weights;
+  }
+
+  if (is_fc_after_conv && (input->data_layout() != fc_info.weights_trained_layout))
+  {
+    // Validate convert weights kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(NEConvertFullyConnectedWeights::validate(
+        weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout));
+    weights_to_use = &converted_weights;
+  }
+
+  if (is_fc_after_conv)
+  {
+    // Fully Connected layer after a Convolution Layer without batches
+    ARM_COMPUTE_RETURN_ERROR_ON(
+        (weights_to_use->dimension(1) !=
+         (input->dimension(0) * input->dimension(1) * input->dimension(2))));
+
+    // Validate flatten kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayerKernel::validate(input, &flatten_input));
+    input_to_use = &flatten_input;
+  }
+  else
+  {
+    // Fully Connected layer after a Fully Connected Layer without batches
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
+  }
+  // Validate matrix multiply kernel
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, *tmp_output));
+
+  // Validate output stage for asymmetric quantized types
+  if (is_quantized)
+  {
+    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(
+        &gemmlowp_output, biases, output));
+  }
+
+  return Status{};
+}
+
+// Executes the fully-connected layer. Because prepare() below is compiled out
+// (#if 0), the one-time weight allocation/reshape/conversion is performed inline
+// here on the first call, guarded by _is_prepared.
+void NEFullyConnectedLayerEx::run()
+{
+  if (!_is_prepared)
+  {
+    // First run only: allocate the internal weight buffers that configure() set up.
+    if (!_are_weights_reshaped)
+      _reshape_weights_output.allocator()->allocate();
+    if (!_are_weights_converted)
+      _converted_weights_output.allocator()->allocate();
+    _is_prepared = true;
+  }
+
+  {
+    ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+    // Reshape of the weights
+    if (!_are_weights_reshaped)
+    {
+      _reshape_weights_function.run();
+    }
+
+    // Convert weights if needed
+    if (!_are_weights_converted)
+    {
+      _convert_weights.run();
+    }
+
+    // Prepare GEMM prepare
+    if (!_is_quantized)
+    {
+      _mm_gemm.prepare();
+    }
+  }
+
+  MemoryGroupResourceScope scope_mg(_memory_group);
+
+  // Linearize input if it comes from a convolutional layer
+  if (_is_fc_after_conv)
+  {
+    NEScheduler::get().schedule(&_flatten_kernel, Window::DimY);
+  }
+
+  // Run matrix multiply
+  if (_is_quantized)
+  {
+    _mm_gemmlowp.run();
+  }
+  else
+  {
+    _mm_gemm.run();
+  }
+
+  // Accumulate biases if provided
+  if (_is_quantized)
+  {
+    _gemmlowp_output_stage.run();
+  }
+  else
+  {
+    if (_accumulate_biases)
+    {
+      NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY);
+    }
+  }
+}
+
+// Intentionally a no-op: the original one-shot preparation is disabled below and its
+// work is done lazily in run() instead (see _is_prepared handling above).
+void NEFullyConnectedLayerEx::prepare()
+{
+#if 0 // TODO Remove this block
+  if (!_is_prepared)
+  {
+    ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+    auto release_unused = [](Tensor *w) {
+      if (!w->is_used())
+      {
+        w->allocator()->free();
+      }
+    };
+
+    // Pointer to current weights
+    const ITensor *cur_weights = _original_weights;
+
+    // Reshape of the weights (happens only once)
+    if (!_are_weights_reshaped)
+    {
+      // Run reshape weights kernel and mark weights as unused
+      _reshape_weights_output.allocator()->allocate();
+      _reshape_weights_function.run();
+
+      cur_weights->mark_as_unused();
+      cur_weights = &_reshape_weights_output;
+      _are_weights_reshaped = true;
+    }
+
+    // Convert weights if needed (happens only once)
+    if (!_are_weights_converted)
+    {
+      _converted_weights_output.allocator()->allocate();
+      _convert_weights.run();
+
+      cur_weights->mark_as_unused();
+      _are_weights_converted = true;
+    }
+
+    // Release reshaped weights if unused
+    release_unused(&_reshape_weights_output);
+
+    // Prepare GEMM prepare and release unused weights
+    if (!_is_quantized)
+    {
+      _mm_gemm.prepare();
+    }
+
+    // NOTE(review): _reshape_weights_output is released a second time here (it was
+    // already released above) — presumably a copy-paste slip; dead code under #if 0,
+    // so left untouched.
+    // Release converted weights if unused
+    release_unused(&_reshape_weights_output);
+    release_unused(&_converted_weights_output);
+
+    _is_prepared = true;
+  }
+#endif
+}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
new file mode 100644
index 000000000..fcac3c7ae
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h"
+
+#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h>
+
+using namespace arm_compute;
+
+// Sets up an optional input reshape followed by one of three fully-connected
+// implementations, selected by kernel_type and the input/weights data types:
+//  - GENERAL                -> NEFullyConnectedLayerEx
+//  - PREPROCESSED_WEIGHTS   -> NEFullyConnectedHybridLayer (F32 input + S8 weights)
+//                              or the stock NEFullyConnectedLayer otherwise.
+void NEFullyConnectedReshapingLayer::configure(const arm_compute::ITensor *input,
+                                               const arm_compute::ITensor *weights,
+                                               const arm_compute::ITensor *biases,
+                                               arm_compute::ITensor *output, bool needs_reshape,
+                                               const arm_compute::TensorShape &reshape,
+                                               KernelType kernel_type)
+{
+  _input = input;
+  _weights = weights;
+  _biases = biases;
+  _output = output;
+  _needs_reshape = needs_reshape;
+
+  const ITensor *input_to_use = input;
+  if (_needs_reshape)
+  {
+    // reshape
+    auto_init_if_empty(*_neon_buffer.info(), _input->info()->clone()->set_tensor_shape(reshape));
+    _neon_reshape.configure(_input, &_neon_buffer);
+    input_to_use = &_neon_buffer;
+  }
+
+  // Immediately-invoked lambda: pick and configure the concrete FC implementation,
+  // transferring ownership to the _neon_fc unique_ptr.
+  _neon_fc = [&]() {
+    if (kernel_type == KernelType::GENERAL)
+    {
+      auto fc = new arm_compute::NEFullyConnectedLayerEx{_memory_manager};
+      fc->configure(input_to_use, _weights, _biases, _output);
+      return std::unique_ptr<arm_compute::IFunction>(fc);
+    }
+    else
+    {
+      assert(kernel_type == KernelType::PREPROCESSED_WEIGHTS);
+
+      // "Hybrid" = float activations with 8-bit signed quantized weights.
+      bool is_hybrid = input->info()->data_type() == DataType::F32 &&
+                       weights->info()->data_type() == DataType::S8;
+
+      if (is_hybrid)
+      {
+        auto fc = new arm_compute::NEFullyConnectedHybridLayer{_memory_manager};
+        fc->configure(input_to_use, _weights, _biases, _output);
+        return std::unique_ptr<arm_compute::IFunction>(fc);
+      }
+      else
+      {
+        auto fc = new arm_compute::NEFullyConnectedLayer{_memory_manager};
+        fc->configure(input_to_use, _weights, _biases, _output);
+        return std::unique_ptr<arm_compute::IFunction>(fc);
+      }
+    }
+  }();
+
+  // NOTE _neon_buffer is inaccessible from outside, and thus it is safe to invoke allocate here.
+  if (_needs_reshape)
+  {
+    _neon_buffer.allocator()->allocate();
+  }
+}
+
+// Runs the optional reshape and then the selected fully-connected function.
+void NEFullyConnectedReshapingLayer::run(void)
+{
+  if (_needs_reshape)
+    _neon_reshape.run();
+
+  _neon_fc->run();
+}
+
+// Delegates one-time preparation to the selected fully-connected function.
+void NEFullyConnectedReshapingLayer::prepare(void) { _neon_fc->prepare(); }
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp
new file mode 100644
index 000000000..11794a1ea
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp
@@ -0,0 +1,503 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+// All state starts cleared; the real setup happens in configure().
+NEGEMMLowpMatrixMultiplyCoreEx::NEGEMMLowpMatrixMultiplyCoreEx(
+    std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr),
+      _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(),
+      _mtx_b_reduction_kernel(), _offset_contribution_kernel(),
+      _offset_contribution_output_stage_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(),
+      _tmp_b(), _mm_result_s32(), _signed_a(), _signed_output(), _original_b(nullptr), _a_offset(0),
+      _b_offset(0), _run_vector_matrix_multiplication(false), _assembly_path(false),
+      _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false),
+      _fuse_output_stage(false), _run_activation(false), _flip_signedness(false)
+{
+}
+
+// Configures the quantized GEMM pipeline: optional A-interleave/B-transpose reshapes,
+// the low-precision matrix multiply, row/column offset reductions, and either a fused
+// offset-contribution+output-stage kernel or a plain offset-contribution kernel.
+void NEGEMMLowpMatrixMultiplyCoreEx::configure(const ITensor *a, const ITensor *b, const ITensor *c,
+                                               ITensor *output, const GEMMInfo &gemm_info)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+  ARM_COMPUTE_UNUSED(c);
+  ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCoreEx::validate(
+      a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));
+
+  const ITensor *matrix_a = a;
+  const ITensor *matrix_b = b;
+  GEMMInfo info = gemm_info;
+
+  // Clear state
+  _mtx_a_reshape_kernel = nullptr;
+  _mtx_b_reshape_kernel = nullptr;
+
+  // Set internal variables
+  _a_offset = a->info()->quantization_info().offset;
+  _b_offset = b->info()->quantization_info().offset;
+  _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
+  _reshape_b_only_on_first_run = info.reshape_b_only_on_first_run();
+  _is_prepared = false;
+  _fused_assembly_path = false;
+  _original_b = b;
+
+  const ITensor *a_to_use = a;
+
+  // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
+  if (info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+  {
+    _fuse_output_stage = true;
+    _memory_group.manage(&_mm_result_s32);
+    TensorInfo info_mm_result_s32(output->info()->tensor_shape(), 1, DataType::S32);
+    _mm_result_s32.allocator()->init(info_mm_result_s32);
+  }
+
+#ifdef __aarch64__
+#if 0 // Can use after arm compute library v19.11
+  switch (a->info()->data_type())
+  {
+    case DataType::QASYMM8:
+    case DataType::QASYMM8_SIGNED:
+    case DataType::U8:
+    case DataType::S8:
+    {
+      if (a_to_use->info()->data_type() == DataType::QASYMM8 &&
+          info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+      {
+        _asm_glue.configure(a_to_use, b, c, output, gemm_info);
+        _fused_assembly_path = _asm_glue.is_configured();
+      }
+      else
+      {
+        _asm_glue.configure(a_to_use, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output,
+                            gemm_info);
+      }
+      _assembly_path = _asm_glue.is_configured();
+      break;
+    }
+    default:
+    {
+      ARM_COMPUTE_ERROR("Datatype not supported");
+      break;
+    }
+  }
+#endif // 0
+  // NOTE(review): on aarch64 configure() unconditionally raises here — the assembly
+  // path above is disabled until ACL v19.11, so only non-aarch64 builds can use this
+  // function as written.
+  ARM_COMPUTE_ERROR("aarch64 not supported");
+#endif /* __aarch64__ */
+  if (!(_assembly_path || _run_vector_matrix_multiplication))
+  {
+    matrix_a = &_tmp_a;
+    matrix_b = &_tmp_b;
+
+    // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width /
+    // 4.0f) ]
+    TensorInfo a_info(compute_interleaved_shape(*a_to_use->info()), 1,
+                      a_to_use->info()->data_type(), a_to_use->info()->quantization_info());
+    // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width /
+    // 16.0f) ]
+    TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(),
+                      b->info()->quantization_info());
+    _tmp_a.allocator()->init(a_info);
+    _tmp_b.allocator()->init(b_info);
+    _memory_group.manage(&_tmp_a);
+    if (!_reshape_b_only_on_first_run)
+    {
+      _memory_group.manage(&_tmp_b);
+    }
+
+    // Configure interleave kernel
+    {
+      auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
+      k->configure(a_to_use, &_tmp_a);
+      _mtx_a_reshape_kernel = std::move(k);
+    }
+
+    // Configure transpose kernel
+    {
+      auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
+      k->configure(b, &_tmp_b);
+      _mtx_b_reshape_kernel = std::move(k);
+    }
+  }
+
+  if (!_fused_assembly_path)
+  {
+    // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
+    if (_a_offset != 0)
+    {
+      TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);
+
+      _vector_sum_col.allocator()->init(info_vector_sum_col);
+      if (!_reshape_b_only_on_first_run)
+      {
+        _memory_group.manage(&_vector_sum_col);
+      }
+
+      // Configure Matrix B reduction kernel
+      _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a_to_use->info()->dimension(0), false);
+    }
+
+    // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
+    if (_b_offset != 0)
+    {
+      TensorInfo info_vector_sum_row(compute_reductionB_shape(*a_to_use->info()), 1, DataType::S32);
+
+      _vector_sum_row.allocator()->init(info_vector_sum_row);
+      _memory_group.manage(&_vector_sum_row);
+
+      // Configure matrix A reduction kernel
+      _mtx_a_reduction_kernel.configure(a_to_use, &_vector_sum_row, a_to_use->info()->dimension(0),
+                                        false);
+    }
+
+    if (_fuse_output_stage)
+    {
+      // Configure matrix multiply kernel
+      if (!_assembly_path)
+      {
+        auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+        k->configure(matrix_a, matrix_b, &_mm_result_s32);
+        _mm_kernel = std::move(k);
+      }
+
+      // Fused path: offset contribution + requantization in one kernel.
+      _offset_contribution_output_stage_kernel.configure(
+          &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col,
+          _b_offset == 0 ? nullptr : &_vector_sum_row, c,
+          _flip_signedness ? &_signed_output : output, a->info()->dimension(0), _a_offset,
+          _b_offset, info.gemmlowp_output_stage());
+    }
+    else
+    {
+      // Configure matrix multiply kernel
+      if (!_assembly_path)
+      {
+        auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+        k->configure(matrix_a, matrix_b, output);
+        _mm_kernel = std::move(k);
+      }
+      // Configure offset contribution kernel
+      _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col,
+                                            _b_offset == 0 ? nullptr : &_vector_sum_row,
+                                            a_to_use->info()->dimension(0), _a_offset, _b_offset);
+    }
+  }
+
+  // Allocate tensors
+  if (!_assembly_path && !_run_vector_matrix_multiplication)
+  {
+    _tmp_a.allocator()->allocate();
+    if (!_reshape_b_only_on_first_run)
+    {
+      _tmp_b.allocator()->allocate();
+    }
+  }
+
+  if (!_fused_assembly_path)
+  {
+    if (_a_offset != 0 && !_reshape_b_only_on_first_run)
+    {
+      _vector_sum_col.allocator()->allocate();
+    }
+
+    if (_b_offset != 0)
+    {
+      _vector_sum_row.allocator()->allocate();
+    }
+  }
+
+  if (_fuse_output_stage)
+  {
+    _mm_result_s32.allocator()->allocate();
+  }
+}
+
+// Static validation mirroring configure(): checks data types (S8 inputs, S32 output),
+// shape compatibility, the optional assembly dispatch, and every sub-kernel used.
+Status NEGEMMLowpMatrixMultiplyCoreEx::validate(const ITensorInfo *a, const ITensorInfo *b,
+                                                const ITensorInfo *c, const ITensorInfo *output,
+                                                const GEMMInfo &gemm_info)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::S8);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::S8);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE,
+      "Bias addition not supported in NEGEMMLowpMatrixMultiplyCoreEx for output S32");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
+                                  "The product AB is defined only if the number of columns in A is "
+                                  "equal to the number of rows in B");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(),
+                                  "Matrix A already reshaped is not supported");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(),
+                                  "Matrix B already reshaped is not supported");
+
+  GEMMInfo info = gemm_info;
+  const ITensorInfo *matrix_a_info = a;
+  const ITensorInfo *matrix_b_info = b;
+
+  const ITensorInfo *a_to_use = a;
+
+  TensorInfo tmp_a_info{};
+  TensorInfo tmp_b_info{};
+  TensorInfo mm_result_s32_info{};
+
+  int32_t a_offset = a->quantization_info().offset;
+  int32_t b_offset = b->quantization_info().offset;
+
+  bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
+  if (fuse_output_stage)
+  {
+    auto_init_if_empty(
+        mm_result_s32_info,
+        a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
+  }
+
+  // Check if we need to run the optimized assembly kernel
+  bool run_optimised = false;
+  bool run_optimised_requantized = false;
+  const bool reshape_b_only_on_first_run = info.reshape_b_only_on_first_run();
+  if (a_to_use->data_type() == DataType::QASYMM8 &&
+      info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+  {
+    run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, output, 1.f, 0.f,
+                                                          reshape_b_only_on_first_run));
+    run_optimised_requantized = run_optimised;
+  }
+  else
+  {
+    run_optimised = bool(NEGEMMAssemblyDispatch::validate(
+        a_to_use, b, fuse_output_stage ? &mm_result_s32_info : output, 1.f, 0.f,
+        reshape_b_only_on_first_run));
+  }
+
+  if (run_optimised)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
+    if (info.depth_output_gemm3d() != 0)
+    {
+      if (info.reinterpret_input_as_3d())
+      {
+        ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+        ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
+      }
+      else
+      {
+        ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
+      }
+    }
+    else
+    {
+      ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+    }
+  }
+  else
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(),
+                                    "NEGEMM cannot reinterpret the input tensor as 3D");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0,
+                                    "NEGEMM cannot reinterpret the output tensor as 3D");
+
+    const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
+    if (!run_vector_matrix_multiplication)
+    {
+      matrix_a_info = &tmp_a_info;
+      matrix_b_info = &tmp_b_info;
+
+      // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width /
+      // 4.0f) ]
+      TensorShape shape_tmp_a = a->tensor_shape();
+      shape_tmp_a.set(0, a->dimension(0) * 4);
+      shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
+
+      // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width
+      // / 16.0f) ]
+      TensorShape shape_tmp_b = b->tensor_shape();
+      shape_tmp_b.set(0, b->dimension(1) * 16);
+      shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
+
+      // Validate interleave kernel
+      auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a));
+      auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));
+
+      ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a_to_use, &tmp_a_info));
+      ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info));
+    }
+  }
+
+  if (!run_optimised_requantized)
+  {
+    TensorInfo info_vector_sum_col{};
+    TensorInfo info_vector_sum_row{};
+
+    // Validate matrix B reduction kernel only if _a_offset is not equal to 0
+    if (a_offset != 0)
+    {
+      info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
+
+      // Configure Matrix B reduction kernel
+      ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(
+          b, &info_vector_sum_col, a->dimension(0), false));
+    }
+
+    // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
+    if (b_offset != 0)
+    {
+      info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
+
+      // Configure matrix A reduction kernel
+      ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(
+          a_to_use, &info_vector_sum_row, a->dimension(0), false));
+    }
+
+    if (fuse_output_stage)
+    {
+      if (!run_optimised)
+      {
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(
+            matrix_a_info, matrix_b_info, &mm_result_s32_info));
+      }
+
+      // Validate offset contribution kernel
+      ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate(
+          &mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col,
+          b_offset == 0 ? nullptr : &info_vector_sum_row, c, output, a_offset, b_offset,
+          info.gemmlowp_output_stage()));
+    }
+    else
+    {
+      if (!run_optimised)
+      {
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
+      }
+      // Validate offset contribution kernel
+      ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(
+          output, a_offset == 0 ? nullptr : &info_vector_sum_col,
+          b_offset == 0 ? nullptr : &info_vector_sum_row, a_offset, b_offset));
+    }
+  }
+  return Status{};
+}
+
+// Executes the configured pipeline: optional reshapes, the GEMM (assembly or NEON
+// kernel), then the offset reductions and contribution/output-stage kernels.
+void NEGEMMLowpMatrixMultiplyCoreEx::run()
+{
+  prepare();
+
+  MemoryGroupResourceScope scope_mg(_memory_group);
+
+  // Reshape inputs
+  if (_mtx_a_reshape_kernel)
+  {
+    NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
+  }
+  if (_mtx_b_reshape_kernel && !_reshape_b_only_on_first_run)
+  {
+    NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
+  }
+
+  // Run GEMM
+  if (_asm_glue.is_configured())
+  {
+    _asm_glue.run();
+  }
+  else
+  {
+    NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
+  }
+
+  if (!_fused_assembly_path)
+  {
+    // Run matrix A reduction kernel only if _b_offset is not equal to 0
+    if (_b_offset != 0)
+    {
+      NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX);
+    }
+
+    // Run matrix B reduction kernel only if _a_offset is not equal to 0
+    if (_a_offset != 0 && !_reshape_b_only_on_first_run)
+    {
+      NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
+    }
+
+    if (_fuse_output_stage)
+    {
+      // Run offset contribution kernel
+      NEScheduler::get().schedule(&_offset_contribution_output_stage_kernel, Window::DimY);
+    }
+    else
+    {
+      // Run offset contribution kernel
+      NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
+    }
+  }
+}
+
+// One-time preparation when B is constant (reshape_b_only_on_first_run): runs the
+// B reshape (assembly or kernel path), marks the original B unused, and precomputes
+// the B column sums needed by the offset contribution.
+void NEGEMMLowpMatrixMultiplyCoreEx::prepare()
+{
+  if (!_is_prepared)
+  {
+    // Run assembly reshape
+    if (_asm_glue.is_configured() && _reshape_b_only_on_first_run)
+    {
+      ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+
+      _asm_glue.prepare();
+      _original_b->mark_as_unused();
+    }
+    // Run non-assembly reshape
+    else if (_mtx_b_reshape_kernel && _reshape_b_only_on_first_run)
+    {
+      ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+
+      // Run reshape kernel and mark original weights tensor as unused
+      _tmp_b.allocator()->allocate();
+      NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
+      _original_b->mark_as_unused();
+    }
+
+    // Run matrix B reduction kernel only if _a_offset is not equal to 0
+    if (_a_offset != 0 && _reshape_b_only_on_first_run)
+    {
+      _vector_sum_col.allocator()->allocate();
+      NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
+    }
+
+    _is_prepared = true;
+  }
+}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
new file mode 100644
index 000000000..90dabb35a
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGatherEx.h"
+
+#include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+// Thin INESimpleFunction-style wrapper: instantiates NEGatherKernelEx and hands it
+// the gather configuration (select slices of `input` along `axis` using `indices`).
+void NEGatherEx::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis)
+{
+  auto k = arm_compute::support::cpp14::make_unique<NEGatherKernelEx>();
+  k->configure(input, indices, output, axis);
+  _kernel = std::move(k);
+}
+
+// Forwards static validation to the underlying kernel.
+Status NEGatherEx::validate(const ITensorInfo *input, const ITensorInfo *indices,
+                            const ITensorInfo *output, int axis)
+{
+  return NEGatherKernelEx::validate(input, indices, output, axis);
+}
+
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
new file mode 100644
index 000000000..624185d2c
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/NEON/functions/NEHashtableLookup.h"
+
+#include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+// Thin wrapper: instantiates NEHashtableLookupKernel which, for each entry of
+// `lookups`, copies the matching row of `input` (keyed by `keys`) into `output`
+// and records match success in `hits`.
+void NEHashtableLookup::configure(const ITensor *lookups, const ITensor *keys, const ITensor *input,
+                                  ITensor *output, ITensor *hits)
+{
+  auto k = arm_compute::support::cpp14::make_unique<NEHashtableLookupKernel>();
+  k->configure(lookups, keys, input, output, hits);
+  _kernel = std::move(k);
+}
+
+// Forwards static validation to the underlying kernel.
+Status NEHashtableLookup::validate(const ITensorInfo *lookups, const ITensorInfo *keys,
+                                   const ITensorInfo *input, const ITensorInfo *output,
+                                   const ITensorInfo *hits)
+{
+  return NEHashtableLookupKernel::validate(lookups, keys, input, output, hits);
+}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp
new file mode 100644
index 000000000..1c2c8f027
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#include "arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NEInstanceNormalizationLayerEx::NEInstanceNormalizationLayerEx(
+    std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false),
+      _permute_input(), _permute_output(), _permuted_input(), _permuted_output()
+{
+}
+
+// The normalization kernel works in NCHW; for NHWC inputs the tensor is permuted to
+// NCHW, normalized, and permuted back. NCHW inputs go straight to the kernel.
+void NEInstanceNormalizationLayerEx::configure(ITensor *input, ITensor *output, ITensor *gamma,
+                                               ITensor *beta, float epsilon)
+{
+  const DataLayout data_layout = input->info()->data_layout();
+
+  // Configure Kernels
+  _is_nchw = data_layout == DataLayout::NCHW;
+
+  if (!_is_nchw)
+  {
+    _memory_group.manage(&_permuted_input);
+    _memory_group.manage(&_permuted_output);
+
+    // Configure the function to transform the input tensor from NHWC -> NCHW
+    _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+    _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+
+    _normalization_kernel.configure(&_permuted_input, &_permuted_output, gamma, beta, epsilon);
+    _permuted_output.info()->set_data_layout(DataLayout::NCHW);
+
+    // A null output means in-place operation: permute back into the input tensor.
+    _permute_output.configure(&_permuted_output, output != nullptr ? output : input,
+                              PermutationVector(2U, 0U, 1U));
+    _permuted_input.allocator()->allocate();
+    _permuted_output.allocator()->allocate();
+  }
+  else
+  {
+    _normalization_kernel.configure(input, output, gamma, beta, epsilon);
+  }
+}
+
+// Forwards validation to the kernel with both descriptors forced to NCHW layout.
+// NOTE(review): this dereferences `output` unconditionally, while configure()
+// tolerates output == nullptr (in-place) — confirm callers always pass a non-null
+// output info here.
+Status NEInstanceNormalizationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                                const ITensorInfo *gamma, const ITensorInfo *beta,
+                                                float epsilon)
+{
+  return NEInstanceNormalizationLayerKernelEx::validate(
+      &input->clone()->set_data_layout(DataLayout::NCHW),
+      &output->clone()->set_data_layout(DataLayout::NCHW), gamma, beta, epsilon);
+}
+
+// Runs (optional NHWC->NCHW permute) -> normalization kernel -> (optional permute back).
+void NEInstanceNormalizationLayerEx::run()
+{
+  MemoryGroupResourceScope scope_mg(_memory_group);
+
+  // Permute input
+  if (!_is_nchw)
+  {
+    _permute_input.run();
+  }
+
+  NEScheduler::get().schedule(&_normalization_kernel, Window::DimZ);
+
+  // Permute output
+  if (!_is_nchw)
+  {
+    _permute_output.run();
+  }
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp
new file mode 100644
index 000000000..1150cef76
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEPReLU.h" + +#include "arm_compute/core/NEON/kernels/NEPReLUKernel.h" +#include "support/ToolchainSupport.h" + +#include <utility> + +using namespace arm_compute; + +void NEPReLU::configure(const ITensor *input, const ITensor *alpha, ITensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<NEPReLUKernel>(); + k->configure(input, alpha, output); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp new file mode 100644 index 000000000..84411c266 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
#include "arm_compute/runtime/NEON/functions/NERNNLayerEx.h"

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"

namespace arm_compute
{
// Basic RNN cell: output = activation(FC(input) + hidden_state * recurrent_weights),
// with the result also copied back into hidden_state for the next step.
NERNNLayerEx::NERNNLayerEx(std::shared_ptr<IMemoryManager> memory_manager)
    : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(),
      _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(),
      _gemm_output(), _add_output(), _is_prepared(false)
{
}

// Static shape/type validation for all participating tensors.
// @return Error status if any dimension relation or sub-function validation fails.
Status NERNNLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights,
                              const ITensorInfo *recurrent_weights, const ITensorInfo *bias,
                              const ITensorInfo *hidden_state, const ITensorInfo *output,
                              const ActivationLayerInfo &info)
{
  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state,
                                      output);

  const int idx_width = 0;
  const int idx_height = 1;
  // Cross-check the classic RNN shape constraints: FC weights match the input width,
  // recurrent weights are square and match the FC output width.
  ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width));
  ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) !=
                              recurrent_weights->dimension(idx_width));
  ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) !=
                              recurrent_weights->dimension(idx_height));
  ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != 1);
  ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(idx_width) != weights->dimension(idx_height));
  ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_width) != weights->dimension(idx_height));
  ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height));
  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
                                                     hidden_state->tensor_shape());

  // Shape of every intermediate tensor (FC out, GEMM out, add out).
  auto shape_info = TensorInfo(misc::shape_calculator::compute_rnn_shape(
                                   recurrent_weights, hidden_state->dimension(idx_height)),
                               1, input->data_type());

  ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, weights, bias, &shape_info));
  ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(
      &shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
  ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&shape_info, &shape_info, info));

  return Status{};
}

// Wire up the sub-functions/kernels and the intermediate buffers.
// Memory-group ordering is deliberate: each intermediate is manage()d before the
// consumer is configured and allocate()d only after its last consumer is configured.
// NOTE(review): `info` is taken by non-const reference here while validate() takes
// const& - signature is fixed by the header; confirm against the declaration.
void NERNNLayerEx::configure(const ITensor *input, const ITensor *weights,
                             const ITensor *recurrent_weights, const ITensor *bias,
                             ITensor *hidden_state, ITensor *output, ActivationLayerInfo &info)
{
  ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
  ARM_COMPUTE_ERROR_THROW_ON(NERNNLayerEx::validate(input->info(), weights->info(),
                                                    recurrent_weights->info(), bias->info(),
                                                    hidden_state->info(), output->info(), info));

  const int idx_height = 1;
  TensorShape shape = misc::shape_calculator::compute_rnn_shape(
      recurrent_weights->info(), hidden_state->info()->dimension(idx_height));

  _is_prepared = false;

  // Manage intermediate buffers and configure
  _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
  _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));

  // Manage intermediate buffers and configure
  _memory_group.manage(&_fully_connected_out);
  _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out);

  _memory_group.manage(&_gemm_output);
  // hidden_state * recurrent_weights (alpha = 1, beta = 0, no bias tensor).
  _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f);

  _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
  _memory_group.manage(&_add_output);

  _add_kernel.configure(&_fully_connected_out, &_gemm_output, &_add_output,
                        ConvertPolicy::SATURATE);

  _fully_connected_out.allocator()->allocate();
  _gemm_output.allocator()->allocate();

  // Activation writes directly into hidden_state (the recurrent state update).
  _activation_kernel.configure(&_add_output, hidden_state, info);
  _add_output.allocator()->allocate();

  // Finally expose the updated hidden state through the output tensor.
  _copy_kernel.configure(hidden_state, output);
}

// Execute one RNN step: FC, recurrent GEMM, add, activation (into hidden_state),
// then copy hidden_state to output.
void NERNNLayerEx::run()
{
  prepare();

  MemoryGroupResourceScope scope_mg(_memory_group);

  _fully_connected_kernel.run();

  _gemm_state_f.run();

  NEScheduler::get().schedule(&_add_kernel, Window::DimY);
  NEScheduler::get().schedule(&_activation_kernel, Window::DimY);

  // copy hidden out to output
  NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
}

// One-time weight preparation for the FC and GEMM sub-functions; idempotent.
void NERNNLayerEx::prepare()
{
  if (!_is_prepared)
  {
    _fully_connected_kernel.prepare();
    _gemm_state_f.prepare();

    _is_prepared = true;
  }
}
} // namespace arm_compute
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEReduceMeanEx.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +using namespace arm_compute; + +NEReduceMeanEx::NEReduceMeanEx(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), + _reduction_ops(), _keep_dims() +{ +} + +Status NEReduceMeanEx::validate(const ITensorInfo *input, const Coordinates &reduction_axis, + bool keep_dims, const ITensorInfo *output) +{ + ARM_COMPUTE_UNUSED(keep_dims); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions()); + + TensorShape out_shape = input->tensor_shape(); + const unsigned int reduction_ops = reduction_axis.num_dimensions(); + const int input_dims = input->num_dimensions(); + Coordinates axis_local = reduction_axis; + + // Convert negative axis + for (unsigned int i = 0; i < reduction_ops; ++i) + { + axis_local[i] = wrap_around(axis_local[i], input_dims); + } + + std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); + for (unsigned int i = 0; i < reduction_ops; ++i) + { + ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); + ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > + input->num_dimensions() - 1); + if (output->total_size() > 0 && keep_dims) + { + ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1); + } + if (keep_dims) + { + out_shape.set(axis_local[i], 1); + } + else + { + out_shape.remove_dimension(axis_local[i] - i); + } + } + const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info); + + return Status{}; +} + +void NEReduceMeanEx::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, + ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + _reduction_ops = reduction_axis.num_dimensions(); 
+ _reduction_kernels = + arm_compute::support::cpp14::make_unique<NEReductionOperation[]>(_reduction_ops); + _reduced_outs = + arm_compute::support::cpp14::make_unique<Tensor[]>(_reduction_ops - (keep_dims ? 1 : 0)); + _keep_dims = keep_dims; + + Coordinates axis_local = reduction_axis; + const int input_dims = input->info()->num_dimensions(); + const unsigned int reduction_ops = reduction_axis.num_dimensions(); + + // Convert negative axis + for (unsigned int i = 0; i < reduction_ops; ++i) + { + axis_local[i] = wrap_around(axis_local[i], input_dims); + } + + // Perform reduction for every axis + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + TensorShape out_shape = i == 0 ? input->info()->tensor_shape() + : (_reduced_outs.get() + i - 1)->info()->tensor_shape(); + out_shape.set(axis_local[i], 1); + auto in = (i == 0) ? input : (_reduced_outs.get() + i - 1); + + if (i == _reduction_ops - 1 && keep_dims) + { + _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::MEAN_SUM); + } + else + { + _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), + input->info()->data_type(), + input->info()->quantization_info()) + .set_data_layout(output->info()->data_layout())); + _memory_group.manage(_reduced_outs.get() + i); + _reduction_kernels[i].configure(in, _reduced_outs.get() + i, axis_local[i], + ReductionOperation::MEAN_SUM); + } + } + + // Allocate intermediate tensors + for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 
1 : 0); ++i) + { + _reduced_outs[i].allocator()->allocate(); + } + + // Configure reshape layer if we want to drop the dimensions + if (!keep_dims) + { + TensorShape out_shape = input->info()->tensor_shape(); + + // We have to sort the reduction axis vectors in order for remove_dimension + // to work properly + std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + out_shape.remove_dimension(axis_local[i] - i); + } + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape)); + _reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output); + } +} + +void NEReduceMeanEx::run() +{ + _memory_group.acquire(); + + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + _reduction_kernels[i].run(); + } + + if (!_keep_dims) + { + _reshape.run(); + } + _memory_group.release(); +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp new file mode 100644 index 000000000..b36f8287a --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
#include "arm_compute/runtime/NEON/functions/NEReduceOperation.h"

#include "arm_compute/core/CPP/Validate.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"

using namespace arm_compute;

// Generic multi-axis reduction (op selectable at configure time), implemented as a
// chain of single-axis reductions plus an optional final reshape when
// keep_dims == false.
NEReduceOperation::NEReduceOperation(std::shared_ptr<IMemoryManager> memory_manager)
    : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(),
      _reduction_ops(), _keep_dims()
{
}

// Static validation: data type, axis count/range, and that `output`'s shape matches
// the shape obtained by reducing (and optionally dropping) every requested axis.
// NOTE(review): `output` is dereferenced unconditionally - callers must pass a
// valid (possibly empty) tensor info.
Status NEReduceOperation::validate(const ITensorInfo *input, const Coordinates &reduction_axis,
                                   bool keep_dims, const ITensorInfo *output, ReduceOperation op)
{
  ARM_COMPUTE_UNUSED(keep_dims);
  ARM_COMPUTE_UNUSED(op);
  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
  ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
                                                       DataType::F32);
  ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());

  TensorShape out_shape = input->tensor_shape();
  const unsigned int reduction_ops = reduction_axis.num_dimensions();
  const int input_dims = input->num_dimensions();
  Coordinates axis_local = reduction_axis;

  // Convert negative axis
  for (unsigned int i = 0; i < reduction_ops; ++i)
  {
    axis_local[i] = wrap_around(axis_local[i], input_dims);
  }

  // Ascending order keeps remove_dimension's shifted index (axis_local[i] - i) valid.
  std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
  for (unsigned int i = 0; i < reduction_ops; ++i)
  {
    ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
    ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) >
                                input->num_dimensions() - 1);
    if (output->total_size() > 0 && keep_dims)
    {
      ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
    }
    if (keep_dims)
    {
      out_shape.set(axis_local[i], 1);
    }
    else
    {
      out_shape.remove_dimension(axis_local[i] - i);
    }
  }
  const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);

  return Status{};
}

// Build the reduction chain: axis i consumes the output of axis i-1. When
// keep_dims is set the final reduction writes into `output`; otherwise all steps
// go to memory-group-managed intermediates and a reshape drops the unit dims.
void NEReduceOperation::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims,
                                  ITensor *output, ReduceOperation op)
{
  ARM_COMPUTE_ERROR_ON_NULLPTR(input);

  _reduction_ops = reduction_axis.num_dimensions();
  _reduction_kernels.resize(_reduction_ops);
  // One intermediate per step, minus one when the last step targets `output`.
  _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
  _keep_dims = keep_dims;

  Coordinates axis_local = reduction_axis;
  const int input_dims = input->info()->num_dimensions();
  const unsigned int reduction_ops = reduction_axis.num_dimensions();

  // Convert negative axis
  for (unsigned int i = 0; i < reduction_ops; ++i)
  {
    axis_local[i] = wrap_around(axis_local[i], input_dims);
  }

  // Perform reduction for every axis
  for (unsigned int i = 0; i < _reduction_ops; ++i)
  {
    TensorShape out_shape =
        i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
    out_shape.set(axis_local[i], 1);
    auto in = (i == 0) ? input : (&_reduced_outs[i - 1]);

    if (i == _reduction_ops - 1 && keep_dims)
    {
      _reduction_kernels[i].configure(in, output, axis_local[i], op);
    }
    else
    {
      _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(),
                                                    input->info()->data_type(),
                                                    input->info()->quantization_info()));
      _memory_group.manage(&_reduced_outs[i]);
      _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], op);
    }
  }

  // Allocate intermediate tensors
  for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
  {
    _reduced_outs[i].allocator()->allocate();
  }

  // Configure reshape layer if we want to drop the dimensions
  if (!keep_dims)
  {
    TensorShape out_shape = input->info()->tensor_shape();

    // We have to sort the reduction axis vectors in order for remove_dimension
    // to work properly
    std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
    for (unsigned int i = 0; i < _reduction_ops; ++i)
    {
      out_shape.remove_dimension(axis_local[i] - i);
    }
    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
    _reshape.configure(&_reduced_outs[_reduction_ops - 1], output);
  }
}

// Execute every reduction step in order, then the optional trailing reshape.
// MemoryGroupResourceScope acquires the intermediate buffers for the call (RAII).
void NEReduceOperation::run()
{
  MemoryGroupResourceScope scope_mg(_memory_group);

  for (unsigned int i = 0; i < _reduction_ops; ++i)
  {
    _reduction_kernels[i].run();
  }

  if (!_keep_dims)
  {
    _reshape.run();
  }
}
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
#include "arm_compute/runtime/NEON/functions/NEReduceSum.h"

#include "arm_compute/core/CPP/Validate.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"

using namespace arm_compute;

// Sum reduction over an arbitrary set of axes: a chain of single-axis SUM
// reductions plus an optional final reshape when keep_dims == false.
NEReduceSum::NEReduceSum(std::shared_ptr<IMemoryManager> memory_manager)
    : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(),
      _reduction_ops(), _keep_dims()
{
}

// Static validation: data type, axis count/range, and that `output`'s shape matches
// the shape obtained by reducing (and optionally dropping) every requested axis.
// NOTE(review): `output` is dereferenced unconditionally - callers must pass a
// valid (possibly empty) tensor info.
Status NEReduceSum::validate(const ITensorInfo *input, const Coordinates &reduction_axis,
                             bool keep_dims, const ITensorInfo *output)
{
  ARM_COMPUTE_UNUSED(keep_dims);
  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
  ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
                                                       DataType::F32);
  ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());

  TensorShape out_shape = input->tensor_shape();
  const unsigned int reduction_ops = reduction_axis.num_dimensions();
  const int input_dims = input->num_dimensions();
  Coordinates axis_local = reduction_axis;

  // Convert negative axis
  for (unsigned int i = 0; i < reduction_ops; ++i)
  {
    axis_local[i] = wrap_around(axis_local[i], input_dims);
  }

  // Ascending order keeps remove_dimension's shifted index (axis_local[i] - i) valid.
  std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
  for (unsigned int i = 0; i < reduction_ops; ++i)
  {
    ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
    ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) >
                                input->num_dimensions() - 1);
    if (output->total_size() > 0 && keep_dims)
    {
      ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
    }
    if (keep_dims)
    {
      out_shape.set(axis_local[i], 1);
    }
    else
    {
      out_shape.remove_dimension(axis_local[i] - i);
    }
  }
  const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);

  return Status{};
}

// Build the reduction chain: axis i consumes the output of axis i-1. When
// keep_dims is set the final reduction writes into `output`; otherwise all steps
// go to memory-group-managed intermediates and a reshape drops the unit dims.
void NEReduceSum::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims,
                            ITensor *output)
{
  ARM_COMPUTE_ERROR_ON_NULLPTR(input);

  _reduction_ops = reduction_axis.num_dimensions();
  _reduction_kernels.resize(_reduction_ops);
  // One intermediate per step, minus one when the last step targets `output`.
  _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
  _keep_dims = keep_dims;

  Coordinates axis_local = reduction_axis;
  const int input_dims = input->info()->num_dimensions();
  const unsigned int reduction_ops = reduction_axis.num_dimensions();

  // Convert negative axis
  for (unsigned int i = 0; i < reduction_ops; ++i)
  {
    axis_local[i] = wrap_around(axis_local[i], input_dims);
  }

  // Perform reduction for every axis
  for (unsigned int i = 0; i < _reduction_ops; ++i)
  {
    TensorShape out_shape =
        i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
    out_shape.set(axis_local[i], 1);
    auto in = (i == 0) ? input : (&_reduced_outs[i - 1]);

    if (i == _reduction_ops - 1 && keep_dims)
    {
      _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::SUM);
    }
    else
    {
      // Intermediates inherit the input's layout here (NEReduceMeanEx uses the
      // output's layout instead - NOTE(review): confirm which is intended).
      _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(),
                                                    input->info()->data_type(),
                                                    input->info()->quantization_info())
                                             .set_data_layout(input->info()->data_layout()));
      _memory_group.manage(&_reduced_outs[i]);
      _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i],
                                      ReductionOperation::SUM);
    }
  }

  // Allocate intermediate tensors
  for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
  {
    _reduced_outs[i].allocator()->allocate();
  }

  // Configure reshape layer if we want to drop the dimensions
  if (!keep_dims)
  {
    TensorShape out_shape = input->info()->tensor_shape();

    // We have to sort the reduction axis vectors in order for remove_dimension
    // to work properly
    std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
    for (unsigned int i = 0; i < _reduction_ops; ++i)
    {
      out_shape.remove_dimension(axis_local[i] - i);
    }
    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
    _reshape.configure(&_reduced_outs[_reduction_ops - 1], output);
  }
}

// Execute every reduction step in order, then the optional trailing reshape.
// MemoryGroupResourceScope acquires the intermediate buffers for the call (RAII).
void NEReduceSum::run()
{
  MemoryGroupResourceScope scope_mg(_memory_group);

  for (unsigned int i = 0; i < _reduction_ops; ++i)
  {
    _reduction_kernels[i].run();
  }

  if (!_keep_dims)
  {
    _reshape.run();
  }
}
#include "arm_compute/runtime/NEON/functions/NEReductionOperationEx.h"

#include "arm_compute/core/Helpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"

namespace arm_compute
{
namespace
{
/** Define dimension to split the window
 *
 * @param[in] axis Reduction axis
 *
 * @return The dimension to split the window
 */
size_t reduction_window_split_dimension(unsigned int axis)
{
  switch (axis)
  {
    case 0:
      return Window::DimY;
    case 1:
    case 2:
    case 3:
      return Window::DimX;
    default:
      ARM_COMPUTE_ERROR("Unsupported reduction axis");
  }
}
} // namespace

// Single-axis MIN/MAX reduction wrapper around NEReductionOperationKernelEx.
// For axis 0 the kernel requires a constant border, configured in configure().
NEReductionOperationEx::NEReductionOperationEx()
    : _reduction_kernel(), _fill_border_kernel(), _window_split(0), _reduction_axis()
{
}

// Static validation; forwards to the kernel's validate.
Status NEReductionOperationEx::validate(const ITensorInfo *input, const ITensorInfo *output,
                                        unsigned int axis, ReduceOperation op)
{
  ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernelEx::validate(input, output, axis, op));

  return Status{};
}

// Configure the reduction kernel and, for axis 0, a border-fill kernel whose fill
// value is the identity of the reduction: +max for MIN (255 for QASYMM8) and
// lowest for MAX (0 for QASYMM8), so border pixels can never win the reduction.
void NEReductionOperationEx::configure(ITensor *input, ITensor *output, unsigned int axis,
                                       ReduceOperation op)
{
  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
  ARM_COMPUTE_ERROR_THROW_ON(
      NEReductionOperationEx::validate(input->info(), output->info(), axis, op));

  // Configure reduction kernel
  _reduction_kernel.configure(input, output, axis, op);
  _window_split = reduction_window_split_dimension(axis);
  _reduction_axis = axis;

  if (axis == 0)
  {
    // Configure fill border kernel
    const BorderSize fill_border_size = _reduction_kernel.border_size();
    PixelValue pixelValue;
    switch (op)
    {
      case ReduceOperation::MIN:
      {
        switch (input->info()->data_type())
        {
          case DataType::F32:
          {
            pixelValue = PixelValue(std::numeric_limits<float>::max());
            break;
          }
          case DataType::F16:
          {
            // 65504 is the largest finite value representable in IEEE fp16.
            pixelValue = PixelValue(static_cast<half>(65504.0f));
            break;
          }
          case DataType::QASYMM8:
          {
            pixelValue =
                PixelValue(255, input->info()->data_type(), input->info()->quantization_info());
            break;
          }
          default:
          {
            ARM_COMPUTE_ERROR("Unsupported DataType");
          }
        }
        break;
      }
      case ReduceOperation::MAX:
      {
        switch (input->info()->data_type())
        {
          case DataType::F32:
          {
            pixelValue = PixelValue(-std::numeric_limits<float>::max());
            break;
          }
          case DataType::F16:
          {
            pixelValue = PixelValue(static_cast<half>(-65504.0f));
            break;
          }
          case DataType::QASYMM8:
          {
            pixelValue =
                PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
            break;
          }
          default:
          {
            ARM_COMPUTE_ERROR("Unsupported DataType");
          }
        }
        break;
      }
      default:
        ARM_COMPUTE_ERROR("Reduction Operation unsupported");
    }
    _fill_border_kernel.configure(input, fill_border_size, BorderMode::CONSTANT, pixelValue);
  }
}

// Execute: fill the border first when reducing along axis 0, then run the
// reduction kernel, split along the dimension chosen at configure time.
void NEReductionOperationEx::run()
{
  if (_reduction_axis == 0)
  {
    NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
  }
  NEScheduler::get().schedule(&_reduction_kernel, _window_split);
}
} // namespace arm_compute
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +namespace arm_compute +{ +NESpaceToBatchLayerEx::NESpaceToBatchLayerEx() + : _space_to_batch_kernel(), _memset_kernel(), _has_padding(false) +{ +} + +void NESpaceToBatchLayerEx::configure(const ITensor *input, const ITensor *block_shape, + const ITensor *paddings, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output); + + if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) + { + _has_padding = true; + _memset_kernel.configure( + output, PixelValue(0, output->info()->data_type(), output->info()->quantization_info())); + } + _space_to_batch_kernel.configure(input, block_shape, paddings, output); +} + +void NESpaceToBatchLayerEx::configure(const ITensor *input, const int block_shape_x, + const int block_shape_y, const Size2D &padding_left, + const Size2D &padding_right, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) + { + _has_padding = true; + _memset_kernel.configure( + output, PixelValue(0, output->info()->data_type(), output->info()->quantization_info())); + } + _space_to_batch_kernel.configure(input, block_shape_x, block_shape_y, padding_left, padding_right, + output); +} + +Status NESpaceToBatchLayerEx::validate(const ITensorInfo *input, const ITensorInfo *block_shape, + const ITensorInfo *paddings, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ON_ERROR( + NESpaceToBatchLayerKernel::validate(input, block_shape, paddings, output)); + + return Status{}; +} + +Status NESpaceToBatchLayerEx::validate(const ITensorInfo *input, const int block_shape_x, + const int block_shape_y, const 
Size2D &padding_left, + const Size2D &padding_right, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate( + input, block_shape_x, block_shape_y, padding_left, padding_right, output)); + + return Status{}; +} + +void NESpaceToBatchLayerEx::run() +{ + // Zero out output only if we have paddings + if (_has_padding) + { + NEScheduler::get().schedule(&_memset_kernel, Window::DimY); + } + NEScheduler::get().schedule(&_space_to_batch_kernel, Window::DimY); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp new file mode 100644 index 000000000..b6ae21cc0 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +namespace arm_compute +{ +void NESpaceToDepthLayerEx::configure(const ITensor *input, ITensor *output, int32_t block_shape) +{ + auto k = arm_compute::support::cpp14::make_unique<NESpaceToDepthLayerKernelEx>(); + k->configure(input, output, block_shape); + _kernel = std::move(k); +} + +Status NESpaceToDepthLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, + int32_t block_shape) +{ + ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToDepthLayerKernelEx::validate(input, output, block_shape)); + return Status{}; +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp new file mode 100644 index 000000000..fd15ef05f --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp @@ -0,0 +1,307 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NETransposeConvLayer.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/UtilsEx.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +using namespace arm_compute::misc::shape_calculator; + +namespace arm_compute +{ +NETransposeConvLayer::NETransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT + : _memory_group(std::move(memory_manager)), + _conv_f(), + _upsample_f(), + _flip_weights(), + _permute_input(), + _permute_weights(), + _permute_output(), + _scaled_output(), + _weights_flipped(), + _permuted_input(), + _permuted_weights(), + _permuted_output(), + _is_nchw(false), + _original_weights(nullptr), + _input(nullptr), + _info(), + _is_prepared(false) +{ +} + +Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *bias, const ITensorInfo *output, + const PadStrideInfo &info, unsigned int invalid_right, + unsigned int invalid_bottom) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, + DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, input); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input); + const unsigned int width_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); + const unsigned int height_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) < 1); + + auto out_dims = transposeconv_output_dimensions( + 
input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx), + weights->dimension(height_idx), info, invalid_right, invalid_bottom); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + if (is_data_type_quantized_asymmetric(input->data_type()) && bias) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); + } + else if (bias) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); + } + + if (output->tensor_shape().total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) < output_shape.x(), + "Output's dim 0 is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) < output_shape.y(), + "Output's dim 1 is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) < output_shape.z(), + "Output's dim 2 is invalid."); + } + + unsigned int pad_left = 0; + unsigned int pad_right = 0; + unsigned int pad_top = 0; + unsigned int pad_bottom = 0; + const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( + *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top, + pad_bottom); + TensorInfo scale_out_info( + input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape)); + scale_out_info.set_data_layout(input->data_layout()); + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + + const unsigned int batches_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES); + const unsigned int channel_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(batches_idx) != + scale_out_info.dimension(batches_idx)); + 
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != + scale_out_info.dimension(channel_idx)); + + ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, + conv_info, WeightsInfo())); + + return Status{}; +} + +void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, + ITensor *output, const PadStrideInfo &info, + unsigned int invalid_right, unsigned int invalid_bottom) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + + const DataLayout data_layout = input->info()->data_layout(); + + _input = input; + _original_weights = weights; + _info = info; + _is_prepared = false; + _is_nchw = data_layout == DataLayout::NCHW; + + const unsigned int stride_x = info.stride().first; + const unsigned int stride_y = info.stride().second; + + const unsigned int width_idx = + get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const unsigned int height_idx = + get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + auto out_dims = transposeconv_output_dimensions( + input->info()->dimension(width_idx), input->info()->dimension(height_idx), + weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info, + invalid_right, invalid_bottom); + + const TensorShape output_shape = + compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); + // Output auto initialization if not yet initialized + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(NETransposeConvLayer::validate( + input->info(), weights->info(), bias == nullptr ? 
nullptr : bias->info(), output->info(), + info, invalid_right, invalid_bottom)); + + _memory_group.manage(&_scaled_output); + + if (!_is_nchw) + { + _memory_group.manage(&_permuted_input); + _memory_group.manage(&_permuted_weights); + _memory_group.manage(&_permuted_output); + + // Configure the function to transform the input tensor from NHWC -> NCHW + _permuted_input.info()->set_quantization_info(input->info()->quantization_info()); + _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U)); + _permuted_input.info()->set_data_layout(DataLayout::NCHW); + + // Configure the function to transform the weights tensor from NHWC -> NCHW + _permuted_weights.info()->set_quantization_info(weights->info()->quantization_info()); + _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U)); + _permuted_weights.info()->set_data_layout(DataLayout::NCHW); + + // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in + // order to match output shape + + unsigned int pad_left = 0; + unsigned int pad_right = 0; + unsigned int pad_top = 0; + unsigned int pad_bottom = 0; + const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( + *_permuted_input.info(), *_permuted_weights.info(), info, out_dims, invalid_right, + invalid_bottom, pad_left, pad_right, pad_top, pad_bottom); + + TensorInfo scale_out_info(scale_out_shape, 1, _permuted_input.info()->data_type(), + _permuted_input.info()->quantization_info()); + scale_out_info.set_data_layout(DataLayout::NCHW); + _scaled_output.allocator()->init(scale_out_info); + + const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, + DimensionRoundingType::CEIL); + _upsample_f.configure(&_permuted_input, &_scaled_output, upsample_info); + + _weights_flipped.allocator()->init(*_permuted_weights.info()->clone()); + _weights_flipped.info()->set_quantization_info(weights->info()->quantization_info()); + 
_flip_weights.configure(&_permuted_weights, &_weights_flipped); + + // setup the function to convolve the upscaled output + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + + const auto out_shape = output->info()->tensor_shape(); + TensorShape permuted_out_shape{out_shape[1], out_shape[2], out_shape[0], out_shape[3]}; + TensorInfo permuted_out_info(permuted_out_shape, 1, output->info()->data_type(), + output->info()->quantization_info()); + _permuted_output.allocator()->init(permuted_out_info); + _permuted_output.info()->set_data_layout(DataLayout::NCHW); + _conv_f.configure(&_scaled_output, &_weights_flipped, bias, &_permuted_output, conv_info); + + // Configure the function to transform the convoluted output to NHWC + _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U)); + + _permuted_input.allocator()->allocate(); + _permuted_weights.allocator()->allocate(); + _permuted_output.allocator()->allocate(); + } + else + { + // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in + // order to match output shape + unsigned int pad_left = 0; + unsigned int pad_right = 0; + unsigned int pad_top = 0; + unsigned int pad_bottom = 0; + const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( + *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, + pad_right, pad_top, pad_bottom); + + TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); + _scaled_output.allocator()->init(scale_out_info); + const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, + DimensionRoundingType::FLOOR); + _upsample_f.configure(input, &_scaled_output, upsample_info); + + _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); + _flip_weights.configure(weights, &_weights_flipped); + + // setup the function to convolve 
the upscaled output + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info); + } + _scaled_output.allocator()->allocate(); +} + +void NETransposeConvLayer::run() +{ + prepare(); + + // MemoryGroupResourceScope scope_mg(_memory_group); + + // Permute input + if (!_is_nchw) + { + _permute_input.run(); + } + + _upsample_f.run(); + _conv_f.run(); + + // Permute output + if (!_is_nchw) + { + _permute_output.run(); + } +} + +void NETransposeConvLayer::prepare() +{ + if (!_is_prepared) + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + // Run weights flipping and mark original weights tensor as unused + _weights_flipped.allocator()->allocate(); + // Permute weights + if (!_is_nchw) + { + _permute_weights.run(); + } + NEScheduler::get().schedule(&_flip_weights, Window::DimZ); + _original_weights->mark_as_unused(); + + // Prepare convolution + _conv_f.prepare(); + + if (!_weights_flipped.is_used()) + { + _weights_flipped.allocator()->free(); + } + + _is_prepared = true; + } +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/misc/functions/GenericGather.cpp b/compute/ARMComputeEx/src/runtime/misc/functions/GenericGather.cpp new file mode 100644 index 000000000..67e1bfb02 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/misc/functions/GenericGather.cpp @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_compute/runtime/misc/functions/GenericGather.h" + +namespace arm_compute +{ +namespace misc +{ + +bool shouldPermute(arm_compute::ITensorInfo *input, arm_compute::ITensorInfo *output) +{ + return (input->num_dimensions() != 4 && output->num_dimensions() == 4 && + input->data_layout() == DataLayout::NCHW); +} + +void GenericGather::configure(arm_compute::ITensor *input, arm_compute::ITensor *indices, + arm_compute::ITensor *output, int axis) +{ + _input = input; + _indices = indices; + _output = output; + _axis = axis; + + arm_compute::PermutationVector pv; + if (shouldPermute(input->info(), output->info())) + { + // NOTE This vector comes from CLPermuteKernel implementation + // + // This implementation permutes a tensor of shape C / W / H into another tensor of shape W / H / + // C + // + // Original | Permuted + // 0 | C | W (from 1) + // 1 | W | H (from 2) + // 2 | H | C (from 0) + // + pv = arm_compute::PermutationVector{1, 2, 0}; + } + + if (utils::isGpuMode()) + { + if (shouldPermute(input->info(), output->info())) + { + _cl_gather.configure(CAST_CL(input), CAST_CL(indices), &_cl_permuted, axis); + _cl_permute.configure(&_cl_permuted, CAST_CL(output), pv); + + // NOTE _permuted is inaccessible from outside, and thus it is safe to invoke allocate here. 
+ _cl_permuted.allocator()->allocate(); + } + else + { + _cl_gather.configure(CAST_CL(input), CAST_CL(indices), CAST_CL(output), axis); + } + } + else + { + throw std::runtime_error("Not supported, yet"); + } +} + +void GenericGather::run(void) +{ + if (utils::isGpuMode()) + { + _cl_gather.run(); + if (shouldPermute(_input->info(), _output->info())) + { + _cl_permute.run(); + } + } + else + { + throw std::runtime_error("Not supported, yet"); + } +} + +} // namespace misc +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/misc/functions/GenericReshapeLayer.cpp b/compute/ARMComputeEx/src/runtime/misc/functions/GenericReshapeLayer.cpp new file mode 100644 index 000000000..8025ae28e --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/misc/functions/GenericReshapeLayer.cpp @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/runtime/misc/functions/GenericReshapeLayer.h" + +namespace arm_compute +{ +namespace misc +{ + +namespace +{ + +bool shouldPermute(const arm_compute::ITensorInfo *input, arm_compute::ITensorInfo *output) +{ + return (input->num_dimensions() == 4 || output->num_dimensions() == 4) && + (input->num_dimensions() != output->num_dimensions() && + input->data_layout() == DataLayout::NCHW); +} + +} // namespace + +void GenericReshapeLayer::configure(const arm_compute::ITensor *input, arm_compute::ITensor *output) +{ + _input = input; + _output = output; + + arm_compute::PermutationVector pv; + if (input->info()->data_layout() == DataLayout::NCHW && input->info()->num_dimensions() == 4 && + output->info()->num_dimensions() != 4) + { + // NOTE This vector comes from CLPermuteKernel implementation + // + // This implementation permutes a tensor of shape W / H / C into another tensor of shape + // C / W / H + // + // Original | Permuted + // 0 | W | C (from 2) + // 1 | H | W (from 0) + // 2 | C | H (from 1) + // + pv = arm_compute::PermutationVector{2, 0, 1}; + } + else if (input->info()->data_layout() == DataLayout::NCHW && + input->info()->num_dimensions() != 4 && output->info()->num_dimensions() == 4) + { + // NOTE This vector comes from CLPermuteKernel implementation + // + // This implementation permutes a tensor of shape C / W / H into another tensor of shape + // W / H / C + // + // Original | Permuted + // 0 | C | W (from 1) + // 1 | W | H (from 2) + // 2 | H | C (from 0) + // + pv = arm_compute::PermutationVector{1, 2, 0}; + } + + if (utils::isGpuMode()) + { + const auto const_input = CAST_CL(const_cast<arm_compute::ITensor *>(input)); + if (shouldPermute(input->info(), output->info())) + { + _cl_permute.configure(const_input, &_cl_permuted, pv); + _cl_reshape.configure(&_cl_permuted, CAST_CL(output)); + + // NOTE _permuted is inaccessible from outside, and thus it is safe to invoke allocate 
here. + _cl_permuted.allocator()->allocate(); + } + else + { + _cl_reshape.configure(const_input, CAST_CL(output)); + } + } + else + { + if (shouldPermute(input->info(), output->info())) + { + _neon_permute.configure(input, &_neon_permuted, pv); + _neon_reshape.configure(&_neon_permuted, output); + + // NOTE _permuted is inaccessible from outside, and thus it is safe to invoke allocate here. + _neon_permuted.allocator()->allocate(); + } + else + { + _neon_reshape.configure(input, output); + } + } +} + +void GenericReshapeLayer::run(void) +{ + if (utils::isGpuMode()) + { + if (shouldPermute(_input->info(), _output->info())) + { + _cl_permute.run(); + } + _cl_reshape.run(); + } + else + { + if (shouldPermute(_input->info(), _output->info())) + { + _neon_permute.run(); + } + _neon_reshape.run(); + } +} + +} // namespace misc +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/misc/functions/Utils.cpp b/compute/ARMComputeEx/src/runtime/misc/functions/Utils.cpp new file mode 100644 index 000000000..44a4bb9ed --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/misc/functions/Utils.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "arm_compute/runtime/misc/functions/Utils.h" + +namespace arm_compute +{ +namespace misc +{ +namespace utils +{ + +bool isGpuMode() +{ + char *neon = std::getenv("NEON"); + if (neon == nullptr) + return true; + else if (neon[0] == '1') + return false; + return true; +} + +} // namespace utils +} // namespace misc +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/topk_v2.h b/compute/ARMComputeEx/src/runtime/topk_v2.h new file mode 100644 index 000000000..f94effea1 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/topk_v2.h @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file topk_v2.h + * @brief This file contains TopK method and TopContainer class for TopK operation + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__ +#define __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__ + +typedef int32_t int32; + +namespace nnfw +{ +namespace rt +{ +namespace optimized_ops +{ +/** + * @brief class to define TopK operation + * @note The follwing codes are impemented and modified while referring to TFLite topk_v2.cc file. + * TopK_v2 of NN Runtime supports TENSOR_FLOAT32, TENSOR_QUANT8_ASYMM, TENSOR_INT32 other than + * TFLite. + * (TFLite additionaly supports kTfLiteInt64.) + * + * The class that collects top indexes of k values. 
Based on template + * tensorflow::gtl::TopN<> but, for optimization, + * it re-uses the same container. + */ +template <typename T> class TopContainer +{ +public: + /** + * @brief Prevent default constructor of of this class + */ + TopContainer() = delete; + /** + * @brief Constructor with params + * @param [in] row_size Size of row in data + * @param [in] k The top k predictions + */ + TopContainer(int32 k, int32 row_size) : k_(k), container_(), values_(nullptr) + { + container_.reserve(std::min(k, row_size) + 1); + } + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + * @param [in] topContainer To copy + */ + TopContainer(const TopContainer &) = delete; + /* + * @brief Prevent instances of this class from being copied (As this class contains pointers) + * @param [in] topContainer To copy + * @return Reference of TopContainer + */ + TopContainer &operator=(const TopContainer &) = delete; + + /** + * @brief Start collecting + * @param [in] values To set as values + * @return N/A + */ + void start_collecting(const T *values) + { + values_ = values; + container_.clear(); + } + + /** + * @brief Push a value to be compared for topk + * @param [in] a A value to compare + * @return N/A + */ + void push(int32 a) + { + auto comparator = [this](int32 a, int32 b) { return compare_fun(a, b); }; + if (container_.size() <= (size_t)k_) + { + container_.push_back(a); + if (container_.size() == (size_t)(k_ + 1)) + { + std::make_heap(container_.begin(), container_.end(), comparator); + std::pop_heap(container_.begin(), container_.end(), comparator); + } + } + else if (comparator(a, container_.front())) + { + container_.back() = a; + std::push_heap(container_.begin(), container_.end(), comparator); + std::pop_heap(container_.begin(), container_.end(), comparator); + } + } + + /** + * @brief Get sorted result from pushed values + * @return Reference of vector with sorted values + */ + const std::vector<int32> &sorted_result() + { + 
auto comparator = [this](int32 a, int32 b) { return compare_fun(a, b); }; + if (container_.size() <= (size_t)(k_)) + { + std::sort(container_.begin(), container_.end(), comparator); + } + else + { + std::sort_heap(container_.begin(), container_.end() - 1, comparator); + container_.resize(k_); + } + return container_; + } + +private: + int32 k_; + std::vector<int32> container_; + const T *values_ = nullptr; + + bool compare_fun(int32 a, int32 b) const + { + if (values_[b] < values_[a]) + { + return true; + } + else if (values_[b] > values_[a]) + { + return false; + } + else + { + return a < b; + } + } +}; + +/** + * @brief Operates TopK operation with params + * @param [in] row_size Size of row in data + * @param [in] num_rows The number of rows in data + * @param [in] data To be operated in + * @param [in] k The top k predictions + * @param [out] output_indexes Indexes of targets in the top k predictions + * @param [out] output_values Values of targets in the top k predictions + * @return N/A + */ +template <typename T> +void TopK(int32 row_size, int32 num_rows, const T *data, int32 k, int32 *output_indexes, + T *output_values) +{ + TopContainer<T> topc(k, row_size); + for (int row = 0; row < num_rows; ++row) + { + const T *values_row = data + row * row_size; + topc.start_collecting(values_row); + for (int32 c = 0; c < row_size; ++c) + { + topc.push(c); + } + + // Prepare output buffers. + int32 *indexes_row = output_indexes + row * k; + T *output_row = output_values + row * k; + // We always assume that the output is sorted. + const auto &top_k = topc.sorted_result(); + std::copy(top_k.begin(), top_k.end(), indexes_row); + std::transform(top_k.begin(), top_k.end(), output_row, + [values_row](const int32 loc) { return values_row[loc]; }); + } +} + +} // namespace optimized_ops +} // namespace rt +} // namespace nnfw + +#endif // __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__ |